From d1a9ba9a34f544d241b7595655e74a68c5b3f60b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 1 Dec 2012 13:42:08 -0500 Subject: [PATCH 01/77] Starting tag work. - Translation dict, contexts, parse_* and handle_* hooks in tokenizer. --- mwparserfromhell/nodes/tag.py | 36 +++++++++++++++++ mwparserfromhell/parser/contexts.py | 65 +++++++++++++++++++----------- mwparserfromhell/parser/tokenizer.c | 1 - mwparserfromhell/parser/tokenizer.py | 77 +++++++++++++++++++++++++++++++++++- 4 files changed, 155 insertions(+), 24 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 5873a49..c32f398 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -73,6 +73,42 @@ class Tag(Node): TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE)) TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE + TRANSLATIONS = { + "i": TAG_ITALIC, + "em": TAG_ITALIC, + "b": TAG_BOLD, + "strong": TAG_BOLD, + "u": TAG_UNDERLINE, + "s": TAG_STRIKETHROUGH, + "ul": TAG_UNORDERED_LIST, + "ol": TAG_ORDERED_LIST, + "dt": TAG_DEF_TERM, + "dd": TAG_DEF_ITEM, + "blockquote": TAG_BLOCKQUOTE, + "hl": TAG_RULE, + "br": TAG_BREAK, + "abbr": TAG_ABBR, + "pre": TAG_PRE, + "tt": TAG_MONOSPACE, + "code": TAG_CODE, + "span": TAG_SPAN, + "div": TAG_DIV, + "font": TAG_FONT, + "small": TAG_SMALL, + "big": TAG_BIG, + "center": TAG_CENTER, + "ref": TAG_REF, + "gallery": TAG_GALLERY, + "math": TAG_MATH, + "nowiki": TAG_NOWIKI, + "noinclude": TAG_NOINCLUDE, + "includeonly": TAG_INCLUDEONLY, + "onlyinclude": TAG_ONLYINCLUDE, + "syntaxhighlight": TAG_SYNTAXHIGHLIGHT, + "source": TAG_SYNTAXHIGHLIGHT, + "poem": TAG_POEM, + } + def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, self_closing=False, open_padding=0, close_padding=0): super(Tag, self).__init__() diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 9d41870..a67bd76 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -62,35 +62,56 @@ Local (stack-specific) contexts: * :py:const:`COMMENT` -Global contexts: +* :py:const:`TAG` -* :py:const:`GL_HEADING` -""" + * :py:const:`TAG_OPEN` + * :py:const:`TAG_ATTR` -# Local contexts: + * :py:const:`TAG_ATTR_NAME` + * :py:const:`TAG_ATTR_BODY` + * :py:const:`TAG_ATTR_BODY_QUOTED` -TEMPLATE = 0b00000000000111 -TEMPLATE_NAME = 0b00000000000001 -TEMPLATE_PARAM_KEY = 0b00000000000010 -TEMPLATE_PARAM_VALUE = 0b00000000000100 + * :py:const:`TAG_BODY` + * :py:const:`TAG_CLOSE` -ARGUMENT = 0b00000000011000 -ARGUMENT_NAME = 0b00000000001000 -ARGUMENT_DEFAULT = 0b00000000010000 +Global contexts: -WIKILINK = 0b00000001100000 -WIKILINK_TITLE = 0b00000000100000 -WIKILINK_TEXT = 0b00000001000000 +* :py:const:`GL_HEADING` +""" -HEADING = 0b01111110000000 -HEADING_LEVEL_1 = 0b00000010000000 -HEADING_LEVEL_2 = 0b00000100000000 -HEADING_LEVEL_3 = 0b00001000000000 -HEADING_LEVEL_4 = 0b00010000000000 -HEADING_LEVEL_5 = 0b00100000000000 -HEADING_LEVEL_6 = 0b01000000000000 +# Local contexts: -COMMENT = 0b10000000000000 +TEMPLATE = 0b00000000000000000111 +TEMPLATE_NAME = 0b00000000000000000001 +TEMPLATE_PARAM_KEY = 0b00000000000000000010 +TEMPLATE_PARAM_VALUE = 0b00000000000000000100 + +ARGUMENT = 0b00000000000000011000 +ARGUMENT_NAME = 0b00000000000000001000 +ARGUMENT_DEFAULT = 0b00000000000000010000 + +WIKILINK = 0b00000000000001100000 +WIKILINK_TITLE = 0b00000000000000100000 +WIKILINK_TEXT = 0b00000000000001000000 + +HEADING = 0b00000001111110000000 +HEADING_LEVEL_1 = 0b00000000000010000000 
+HEADING_LEVEL_2 = 0b00000000000100000000 +HEADING_LEVEL_3 = 0b00000000001000000000 +HEADING_LEVEL_4 = 0b00000000010000000000 +HEADING_LEVEL_5 = 0b00000000100000000000 +HEADING_LEVEL_6 = 0b00000001000000000000 + +COMMENT = 0b00000010000000000000 + +TAG = 0b11111100000000000000 +TAG_OPEN = 0b00000100000000000000 +TAG_ATTR = 0b00111000000000000000 +TAG_ATTR_NAME = 0b00001000000000000000 +TAG_ATTR_BODY = 0b00010000000000000000 +TAG_ATTR_BODY_QUOTED = 0b00100000000000000000 +TAG_BODY = 0b01000000000000000000 +TAG_CLOSE = 0b10000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index cc1b4dd..71b6cc3 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -767,7 +767,6 @@ Tokenizer_parse_heading(Tokenizer* self) self->global ^= GL_HEADING; return 0; } - level = PyInt_FromSsize_t(heading->level); if (!level) { Py_DECREF(heading->title); diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 5b0e976..f640aa2 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -27,6 +27,7 @@ import string from . import contexts from . import tokens +from ..nodes.tag import Tag from ..compat import htmlentities __all__ = ["Tokenizer"] @@ -420,6 +421,57 @@ class Tokenizer(object): self._write(tokens.CommentEnd()) self._head += 2 + def _parse_tag(self): + """Parse an HTML tag at the head of the wikicode string.""" + self._head += 1 + reset = self._head + self._push() + try: + t_open, type_, self_close, o_pad = self._parse(contexts.TAG_OPEN) + if not self_close: + t_body = self._parse(contexts.TAG_BODY) + t_close, c_pad = self._parse(contexts.TAG_CLOSE) + except BadRoute: + self._head = reset + self._pop() + self._write_text("<") + else: + self._pop() + self._write(tokens.TagOpenOpen(type=type_, showtag=False)) + self._write_all(t_open) + if self_close: + self._write(tokens.TagCloseSelfclose(padding=o_pad)) + else: + self._write(tokens.TagCloseOpen(padding=o_pad)) + self._write_all(t_body) + self._write(tokens.TagOpenClose()) + self._write_all(t_close) + self._write(tokens.TagCloseClose(padding=c_pad)) + + def _handle_attribute(self): + if not self._context & contexts.TAG_ATTR: + ## check name is valid + + def _handle_attribute_name(self): + ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED + pass + + def _handle_quoted_attribute_close(self): + pass + + def _handle_tag_close_open(self): + pass ## .padding + + def _handle_tag_selfclose(self): + pass ## .padding + + def _handle_tag_close_open(self): + pass + + def _handle_tag_close_close(self): + ## check that the closing name is the same as the opening name + pass ## .padding + def _parse(self, context=0): """Parse the wikicode string, using *context* for when to stop.""" self._push(context) @@ -432,7 +484,7 @@ class Tokenizer(object): if this is self.END: fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | contexts.HEADING | - contexts.COMMENT) + contexts.COMMENT | contexts.TAG) if self._context & contexts.TEMPLATE_PARAM_KEY: self._pop() if self._context & fail: @@ -484,6 +536,29 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) + elif this == "<" and not self._context & (contexts.TAG ^ contexts.TAG_BODY): + self._parse_tag() + elif this == " " and (self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + self._handle_attribute() + elif this == "=" and self._context & 
contexts.TAG_ATTR_NAME: + self._handle_attribute_name() + elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: + self._handle_quoted_attribute_close() + elif this == "\n" and (self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + self._fail_route() + elif this == ">" and (self._context & contexts.TAG_ATTR_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + return self._handle_tag_close_open() + elif this == "/" and next == ">" and ( + self._context & contexts.TAG_ATTR_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + return self._handle_tag_selfclose() + elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: + self._handle_tag_close_open() + elif this == ">" and self._context & contexts.TAG_CLOSE: + self._handle_tag_close_close() else: self._write_text(this) self._head += 1 From 05ec7a1a92fdf2549e8722aabd4a36a4825f3227 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 8 Dec 2012 22:04:03 -0500 Subject: [PATCH 02/77] Improve padding support for Tags; more code for tags in tokenizer. --- mwparserfromhell/nodes/extras/attribute.py | 27 +++++-- mwparserfromhell/nodes/tag.py | 18 ++--- mwparserfromhell/parser/tokenizer.py | 116 ++++++++++++++++++----------- 3 files changed, 100 insertions(+), 61 deletions(-) diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index 648bca0..58a99a8 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -36,18 +36,20 @@ class Attribute(StringMixIn): whose value is ``"foo"``. """ - def __init__(self, name, value=None, quoted=True): + def __init__(self, name, value=None, quoted=True, padding=""): super(Attribute, self).__init__() self._name = name self._value = value self._quoted = quoted + self._padding = padding def __unicode__(self): + base = self.padding + str(self.name) if self.value: if self.quoted: - return str(self.name) + '="' + str(self.value) + '"' - return str(self.name) + "=" + str(self.value) - return str(self.name) + return base + '="' + str(self.value) + '"' + return base + "=" + str(self.value) + return base @property def name(self): @@ -64,14 +66,23 @@ class Attribute(StringMixIn): """Whether the attribute's value is quoted with double quotes.""" return self._quoted + @property + def padding(self): + """Spacing to insert right before the attribute.""" + return self._padding + @name.setter - def name(self, newval): - self._name = parse_anything(newval) + def name(self, value): + self._name = parse_anything(value) @value.setter def value(self, newval): self._value = parse_anything(newval) @quoted.setter - def quoted(self, newval): - self._quoted = bool(newval) + def quoted(self, value): + self._quoted = bool(value) + + @padding.setter + def padding(self, value): + self._padding = str(value) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index c32f398..681a17a 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -110,7 +110,7 @@ class Tag(Node): } def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, - self_closing=False, open_padding=0, close_padding=0): + self_closing=False, open_padding="", close_padding=""): super(Tag, self).__init__() self._type = type_ self._tag = tag @@ -136,10 +136,10 @@ class Tag(Node): if self.attrs: result += " " + " ".join([str(attr) for attr in self.attrs]) if self.self_closing: - result += " " * self.open_padding + "/>" + result += 
self.open_padding + "/>" else: - result += " " * self.open_padding + ">" + str(self.contents) - result += "" + result += self.open_padding + ">" + str(self.contents) + result += "" return result def __iternodes__(self, getter): @@ -232,17 +232,17 @@ class Tag(Node): @property def self_closing(self): - """Whether the tag is self-closing with no content.""" + """Whether the tag is self-closing with no content (like ``
``).""" return self._self_closing @property def open_padding(self): - """How much spacing to insert before the first closing >.""" + """Spacing to insert before the first closing >.""" return self._open_padding @property def close_padding(self): - """How much spacing to insert before the last closing >.""" + """Spacing to insert before the last closing > (excl. self-closing).""" return self._close_padding @type.setter @@ -270,8 +270,8 @@ class Tag(Node): @open_padding.setter def open_padding(self, value): - self._open_padding = int(value) + self._open_padding = str(value) @close_padding.setter def close_padding(self, value): - self._close_padding = int(value) + self._close_padding = str(value) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index f640aa2..80d7610 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -425,52 +425,77 @@ class Tokenizer(object): """Parse an HTML tag at the head of the wikicode string.""" self._head += 1 reset = self._head - self._push() try: - t_open, type_, self_close, o_pad = self._parse(contexts.TAG_OPEN) - if not self_close: - t_body = self._parse(contexts.TAG_BODY) - t_close, c_pad = self._parse(contexts.TAG_CLOSE) + tokens = self._parse(contexts.TAG_OPEN) except BadRoute: self._head = reset - self._pop() self._write_text("<") else: - self._pop() - self._write(tokens.TagOpenOpen(type=type_, showtag=False)) - self._write_all(t_open) - if self_close: - self._write(tokens.TagCloseSelfclose(padding=o_pad)) - else: - self._write(tokens.TagCloseOpen(padding=o_pad)) - self._write_all(t_body) - self._write(tokens.TagOpenClose()) - self._write_all(t_close) - self._write(tokens.TagCloseClose(padding=c_pad)) + self._write_all(tokens) - def _handle_attribute(self): - if not self._context & contexts.TAG_ATTR: - ## check name is valid + def _get_tag_type_from_stack(self): + self._push_textbuffer() + if not self._stack: + return None # Tag has an empty name? 
+ text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] + text = "".join([token.text for token in text]).strip().lower() + try: + return Tag.TRANSLATIONS[text] + except KeyError: + return Tag.TAG_UNKNOWN + + def _handle_tag_close_name(self): + tag = self._get_tag_type_from_stack() + if tag is None: + self._fail_route() + self._write(tokens.TagOpenOpen(type=tag, showtag=False)) - def _handle_attribute_name(self): - ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED - pass + # def _handle_attribute(self): + # if not self._context & contexts.TAG_ATTR: + # self._handle_tag_close_name() - def _handle_quoted_attribute_close(self): - pass + # def _handle_attribute_name(self): + # ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED + # pass + + # def _handle_quoted_attribute_close(self): + # pass def _handle_tag_close_open(self): - pass ## .padding + if not self._context & contexts.TAG_ATTR: + self._handle_tag_close_name() + + self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* + self._context |= contexts.TAG_BODY + + padding = "" # TODO + self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): - pass ## .padding + self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* + self._context |= contexts.TAG_BODY - def _handle_tag_close_open(self): - pass + padding = "" # TODO + self._write(tokens.TagCloseSelfclose(padding=padding)) + self._pop() + + def _handle_tag_open_close(self): + self._context ^= contexts.TAG_BODY + self._context |= contexts.TAG_CLOSE + self._write(tokens.TagOpenClose()) + self._push() + self._head += 1 def _handle_tag_close_close(self): - ## check that the closing name is the same as the opening name - pass ## .padding + tag = self._get_tag_type_from_stack() + closing = self._pop() + if tag != self._stack[0].type: + # Closing and opening tags are not the same, so fail this route: + self._fail_route() + self._write_all(closing) + padding = "" # TODO + self._write(tokens.TagCloseClose(padding=padding)) + return self._pop() def _parse(self, context=0): """Parse the wikicode string, using *context* for when to stop.""" @@ -485,7 +510,8 @@ class Tokenizer(object): fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | contexts.HEADING | contexts.COMMENT | contexts.TAG) - if self._context & contexts.TEMPLATE_PARAM_KEY: + double_fail = contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE + if self._context & double_fail: self._pop() if self._context & fail: self._fail_route() @@ -538,27 +564,29 @@ class Tokenizer(object): self._write_text(this) elif this == "<" and not self._context & (contexts.TAG ^ contexts.TAG_BODY): self._parse_tag() - elif this == " " and (self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - self._handle_attribute() - elif this == "=" and self._context & contexts.TAG_ATTR_NAME: - self._handle_attribute_name() - elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: - self._handle_quoted_attribute_close() + # elif this == " " and (self._context & contexts.TAG_OPEN and not + # self._context & contexts.TAG_ATTR_BODY_QUOTED): + # self._handle_attribute() + # elif this == "=" and self._context & contexts.TAG_ATTR_NAME: + # self._handle_attribute_name() + # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: + # self._handle_quoted_attribute_close() elif this == "\n" and (self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): + if self._context & contexts.TAG_CLOSE: + self._pop() 
self._fail_route() - elif this == ">" and (self._context & contexts.TAG_ATTR_OPEN and not + elif this == ">" and (self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): - return self._handle_tag_close_open() + self._handle_tag_close_open() elif this == "/" and next == ">" and ( - self._context & contexts.TAG_ATTR_OPEN and not + self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): return self._handle_tag_selfclose() elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: - self._handle_tag_close_open() + self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: - self._handle_tag_close_close() + return self._handle_tag_close_close() else: self._write_text(this) self._head += 1 From 7e46601b1d358a09dfa8641b03d6bb2a5eeb63c3 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 00:20:21 -0500 Subject: [PATCH 03/77] Tags should fully work now in tokenizer and builder. Still need to do attributes. --- mwparserfromhell/nodes/tag.py | 5 +-- mwparserfromhell/parser/builder.py | 2 ++ mwparserfromhell/parser/tokenizer.py | 62 ++++++++++++++++++++---------------- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 681a17a..48effa1 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -70,8 +70,9 @@ class Tag(Node): TAG_POEM = 202 # Lists of tags: + TAGS_ALL = set(range(300)) TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE)) - TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE + TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE TRANSLATIONS = { "i": TAG_ITALIC, @@ -248,7 +249,7 @@ class Tag(Node): @type.setter def type(self, value): value = int(value) - if value not in self.TAGS_INVISIBLE | self.TAGS_VISIBLE: + if value not in self.TAGS_ALL: raise ValueError(value) self._type = value diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 61a8209..648842c 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -219,7 +219,9 @@ class Builder(object): self_closing=True, open_padding=token.padding) elif isinstance(token, tokens.TagOpenClose): contents = self._pop() + self._push() elif isinstance(token, tokens.TagCloseClose): + self._pop() return Tag(type_, tag, contents, attrs, showtag, False, open_pad, token.padding) else: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 80d7610..2e72951 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -423,8 +423,8 @@ class Tokenizer(object): def _parse_tag(self): """Parse an HTML tag at the head of the wikicode string.""" - self._head += 1 reset = self._head + self._head += 1 try: tokens = self._parse(contexts.TAG_OPEN) except BadRoute: @@ -444,11 +444,24 @@ class Tokenizer(object): except KeyError: return Tag.TAG_UNKNOWN - def _handle_tag_close_name(self): - tag = self._get_tag_type_from_stack() - if tag is None: - self._fail_route() - self._write(tokens.TagOpenOpen(type=tag, showtag=False)) + def _actually_close_tag_opening(self): + if self._context & contexts.TAG_ATTR: + if self._context & contexts.TAG_ATTR_BODY: + self._context ^= contexts.TAG_ATTR_BODY + if self._context & contexts.TAG_ATTR_BODY_QUOTED: + self._context ^= contexts.TAG_ATTR_BODY_QUOTED + else: + self._context ^= contexts.TAG_ATTR_NAME + else: + tag = self._get_tag_type_from_stack() + if tag is 
None: + self._fail_route() + self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + + self._context ^= contexts.TAG_OPEN + self._context |= contexts.TAG_BODY + padding = "" # TODO + return padding # def _handle_attribute(self): # if not self._context & contexts.TAG_ATTR: @@ -462,28 +475,18 @@ class Tokenizer(object): # pass def _handle_tag_close_open(self): - if not self._context & contexts.TAG_ATTR: - self._handle_tag_close_name() - - self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* - self._context |= contexts.TAG_BODY - - padding = "" # TODO + padding = self._actually_close_tag_opening() self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): - self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* - self._context |= contexts.TAG_BODY - - padding = "" # TODO + padding = self._actually_close_tag_opening() self._write(tokens.TagCloseSelfclose(padding=padding)) - self._pop() + self._head += 1 + return self._pop() def _handle_tag_open_close(self): - self._context ^= contexts.TAG_BODY - self._context |= contexts.TAG_CLOSE self._write(tokens.TagOpenClose()) - self._push() + self._push(contexts.TAG_CLOSE) self._head += 1 def _handle_tag_close_close(self): @@ -562,7 +565,8 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) - elif this == "<" and not self._context & (contexts.TAG ^ contexts.TAG_BODY): + elif this == "<" and next != "/" and ( + not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() # elif this == " " and (self._context & contexts.TAG_OPEN and not # self._context & contexts.TAG_ATTR_BODY_QUOTED): @@ -571,17 +575,19 @@ class Tokenizer(object): # self._handle_attribute_name() # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: # self._handle_quoted_attribute_close() - elif this == "\n" and (self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): + elif this == "\n" and ( + self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): if self._context & contexts.TAG_CLOSE: self._pop() self._fail_route() - elif this == ">" and (self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): + elif this == ">" and ( + self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): self._handle_tag_close_open() elif this == "/" and next == ">" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): + self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): return self._handle_tag_selfclose() elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() From f78bcf832a08b81d7a9a03f344d2bd82bf97b6c0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 00:29:37 -0500 Subject: [PATCH 04/77] Keep .type and .tag synchronized in Tags when using their setters. 
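
For reference, a rough standalone sketch of the two lookups the setters rely
on (illustrative only: trimmed stand-ins for Tag.TRANSLATIONS and the type
constants, not the real class):

    TAG_UNKNOWN, TAG_ITALIC, TAG_BOLD = 0, 1, 2
    TRANSLATIONS = {"i": TAG_ITALIC, "em": TAG_ITALIC, "b": TAG_BOLD}

    def tag_for_type(value):
        # Reverse lookup, as in the type setter: any key mapping to the
        # given constant is an acceptable tag name.
        for key in TRANSLATIONS:
            if TRANSLATIONS[key] == value:
                return key

    def type_for_tag(text):
        # Forward lookup, as in the tag setter, with an unknown fallback.
        return TRANSLATIONS.get(text.strip().lower(), TAG_UNKNOWN)

    assert tag_for_type(TAG_BOLD) == "b"
    assert type_for_tag(" EM ") == TAG_ITALIC
    assert type_for_tag("q") == TAG_UNKNOWN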
---
 mwparserfromhell/nodes/tag.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index 48effa1..b1eb133 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -252,10 +252,17 @@ class Tag(Node):
         if value not in self.TAGS_ALL:
             raise ValueError(value)
         self._type = value
+        for key in self.TRANSLATIONS:
+            if self.TRANSLATIONS[key] == value:
+                self._tag = parse_anything(key)
 
     @tag.setter
     def tag(self, value):
         self._tag = parse_anything(value)
+        try:
+            self._type = self.TRANSLATIONS[text]
+        except KeyError:
+            self._type = self.TAG_UNKNOWN
 
     @contents.setter
     def contents(self, value):

From 827c544721e223c2f9a5eaf90d5742b2d45de449 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 9 Dec 2012 01:38:45 -0500
Subject: [PATCH 05/77] Should correctly handle closing tags with strange spacing.

---
 mwparserfromhell/nodes/tag.py        | 29 ++++++++++++++++++-----------
 mwparserfromhell/parser/builder.py   |  3 +--
 mwparserfromhell/parser/tokenizer.py |  6 +++---
 3 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index b1eb133..1f3bdf9 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -111,7 +111,7 @@ class Tag(Node):
     }
 
     def __init__(self, type_, tag, contents=None, attrs=None, showtag=True,
-                 self_closing=False, open_padding="", close_padding=""):
+                 self_closing=False, open_padding="", closing_tag=None):
         super(Tag, self).__init__()
         self._type = type_
         self._tag = tag
@@ -123,7 +123,10 @@ class Tag(Node):
         self._showtag = showtag
         self._self_closing = self_closing
         self._open_padding = open_padding
-        self._close_padding = close_padding
+        if closing_tag:
+            self._closing_tag = closing_tag
+        else:
+            self._closing_tag = tag
 
     def __unicode__(self):
         if not self.showtag:
@@ -140,7 +143,7 @@ class Tag(Node):
             result += self.open_padding + "/>"
         else:
             result += self.open_padding + ">" + str(self.contents)
-            result += "</" + str(self.tag) + self.close_padding + ">"
+            result += "</" + str(self.tag) + ">"
         return result
 
     def __iternodes__(self, getter):
@@ -245,9 +248,13 @@ class Tag(Node):
 
     @property
-    def close_padding(self):
-        """Spacing to insert before the last closing > (excl. self-closing)."""
-        return self._close_padding
+    def closing_tag(self):
+        """The closing tag, as a :py:class:`~.Wikicode` object.
+
+        This will usually equal :py:attr:`tag`, unless there is additional
+        spacing, comments, or the like. 
+ """ + return self._closing_tag @type.setter def type(self, value): @@ -254,11 +261,11 @@ class Tag(Node): self._type = value for key in self.TRANSLATIONS: if self.TRANSLATIONS[key] == value: - self._tag = parse_anything(key) + self._tag = self._closing_tag = parse_anything(key) @tag.setter def tag(self, value): - self._tag = parse_anything(value) + self._tag = self._closing_tag = parse_anything(value) try: self._type = self.TRANSLATIONS[text] except KeyError: @@ -280,6 +287,6 @@ class Tag(Node): def open_padding(self, value): self._open_padding = str(value) - @close_padding.setter - def close_padding(self, value): - self._close_padding = str(value) + @closing_tag.setter + def closing_tag(self, value): + self._closing_tag = parse_anything(value) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 648842c..90274fa 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -221,9 +221,8 @@ class Builder(object): contents = self._pop() self._push() elif isinstance(token, tokens.TagCloseClose): - self._pop() return Tag(type_, tag, contents, attrs, showtag, False, - open_pad, token.padding) + open_pad, self._pop()) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2e72951..9e9465d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -496,8 +496,7 @@ class Tokenizer(object): # Closing and opening tags are not the same, so fail this route: self._fail_route() self._write_all(closing) - padding = "" # TODO - self._write(tokens.TagCloseClose(padding=padding)) + self._write(tokens.TagCloseClose()) return self._pop() def _parse(self, context=0): @@ -589,7 +588,8 @@ class Tokenizer(object): self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): return self._handle_tag_selfclose() - elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: + elif this == "<" and next == "/" and ( + self._context & contexts.TAG_BODY): self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() From a21c69fa1e0fc6111b98a5028e8c214f21139dd0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 01:47:41 -0500 Subject: [PATCH 06/77] Split off tag definitions into a new file. --- mwparserfromhell/nodes/tag.py | 104 ++----------------------------------- mwparserfromhell/tag_defs.py | 118 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 100 deletions(-) create mode 100644 mwparserfromhell/tag_defs.py diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 1f3bdf9..ea98bb6 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -24,92 +24,14 @@ from __future__ import unicode_literals from . 
import Node, Text
 from ..compat import str
+from ..tag_defs import TagDefinitions
 from ..utils import parse_anything
 
 __all__ = ["Tag"]
 
-class Tag(Node):
+class Tag(TagDefinitions, Node):
     """Represents an HTML-style tag in wikicode, like ``<ref>``."""
 
     def __init__(self, type_, tag, contents=None, attrs=None, showtag=True,
                  self_closing=False, open_padding="", closing_tag=None):
         super(Tag, self).__init__()
@@ -130,7 +52,7 @@ class Tag(TagDefinitions, Node):
     def __unicode__(self):
         if not self.showtag:
-            open_, close = self._translate()
+            open_, close = self.WIKICODE[self.type]
             if self.self_closing:
                 return open_
             else:
                 return open_ + str(self.contents) + close
 
         result = "<" + str(self.tag)
@@ -188,24 +110,6 @@ class Tag(TagDefinitions, Node):
         get(self.tag)
         write(">")
 
-    def _translate(self):
-        """If the HTML-style tag has a wikicode representation, return that.
-
-        For example, ``<b>Foo</b>`` can be represented as ``'''Foo'''``. This
-        returns a tuple of the character starting the sequence and the
-        character ending it. 
-        
- """ - translations = { - self.TAG_ITALIC: ("''", "''"), - self.TAG_BOLD: ("'''", "'''"), - self.TAG_UNORDERED_LIST: ("*", ""), - self.TAG_ORDERED_LIST: ("#", ""), - self.TAG_DEF_TERM: (";", ""), - self.TAG_DEF_ITEM: (":", ""), - self.TAG_RULE: ("----", ""), - } - return translations[self.type] - @property def type(self): """The tag type.""" @@ -241,7 +145,7 @@ class Tag(Node): @property def open_padding(self): - """Spacing to insert before the first closing >.""" + """Spacing to insert before the first closing ``>``.""" return self._open_padding @property diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py new file mode 100644 index 0000000..74d3a81 --- /dev/null +++ b/mwparserfromhell/tag_defs.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import unicode_literals + +class TagDefinitions(object): + """Contains numerical definitions for valid HTML (and wikicode) tags. + + Base class for :py:class:`~.Tag` objects. 
+ """ + + TAG_UNKNOWN = 0 + + # Basic HTML: + TAG_ITALIC = 1 + TAG_BOLD = 2 + TAG_UNDERLINE = 3 + TAG_STRIKETHROUGH = 4 + TAG_UNORDERED_LIST = 5 + TAG_ORDERED_LIST = 6 + TAG_DEF_TERM = 7 + TAG_DEF_ITEM = 8 + TAG_BLOCKQUOTE = 9 + TAG_RULE = 10 + TAG_BREAK = 11 + TAG_ABBR = 12 + TAG_PRE = 13 + TAG_MONOSPACE = 14 + TAG_CODE = 15 + TAG_SPAN = 16 + TAG_DIV = 17 + TAG_FONT = 18 + TAG_SMALL = 19 + TAG_BIG = 20 + TAG_CENTER = 21 + + # MediaWiki parser hooks: + TAG_REF = 101 + TAG_GALLERY = 102 + TAG_MATH = 103 + TAG_NOWIKI = 104 + TAG_NOINCLUDE = 105 + TAG_INCLUDEONLY = 106 + TAG_ONLYINCLUDE = 107 + + # Additional parser hooks: + TAG_SYNTAXHIGHLIGHT = 201 + TAG_POEM = 202 + + # Lists of tags: + TAGS_ALL = set(range(300)) + TAGS_INVISIBLE = {TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE} + TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE + + TRANSLATIONS = { + "i": TAG_ITALIC, + "em": TAG_ITALIC, + "b": TAG_BOLD, + "strong": TAG_BOLD, + "u": TAG_UNDERLINE, + "s": TAG_STRIKETHROUGH, + "ul": TAG_UNORDERED_LIST, + "ol": TAG_ORDERED_LIST, + "dt": TAG_DEF_TERM, + "dd": TAG_DEF_ITEM, + "blockquote": TAG_BLOCKQUOTE, + "hl": TAG_RULE, + "br": TAG_BREAK, + "abbr": TAG_ABBR, + "pre": TAG_PRE, + "tt": TAG_MONOSPACE, + "code": TAG_CODE, + "span": TAG_SPAN, + "div": TAG_DIV, + "font": TAG_FONT, + "small": TAG_SMALL, + "big": TAG_BIG, + "center": TAG_CENTER, + "ref": TAG_REF, + "gallery": TAG_GALLERY, + "math": TAG_MATH, + "nowiki": TAG_NOWIKI, + "noinclude": TAG_NOINCLUDE, + "includeonly": TAG_INCLUDEONLY, + "onlyinclude": TAG_ONLYINCLUDE, + "syntaxhighlight": TAG_SYNTAXHIGHLIGHT, + "source": TAG_SYNTAXHIGHLIGHT, + "poem": TAG_POEM, + } + + WIKICODE = { + TAG_ITALIC: ("''", "''"), + TAG_BOLD: ("'''", "'''"), + TAG_UNORDERED_LIST: ("*", ""), + TAG_ORDERED_LIST: ("#", ""), + TAG_DEF_TERM: (";", ""), + TAG_DEF_ITEM: (":", ""), + TAG_RULE: ("----", ""), + } From 252cc13a998d60d8a8daf89dc3aa53e5f9bdde27 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 02:01:23 -0500 Subject: [PATCH 07/77] Move repeated context checks into one block in Tokenizer._parse(). 
--- mwparserfromhell/parser/tokenizer.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9e9465d..99f5a7b 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -574,20 +574,18 @@ class Tokenizer(object): # self._handle_attribute_name() # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: # self._handle_quoted_attribute_close() - elif this == "\n" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - if self._context & contexts.TAG_CLOSE: - self._pop() - self._fail_route() - elif this == ">" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - self._handle_tag_close_open() - elif this == "/" and next == ">" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - return self._handle_tag_selfclose() + elif self._context & contexts.TAG_OPEN and ( + not self._context & contexts.TAG_ATTR_BODY_QUOTED): + if this == "\n": + if self._context & contexts.TAG_CLOSE: + self._pop() + self._fail_route() + elif this == ">": + self._handle_tag_close_open() + elif this == "/": + return self._handle_tag_selfclose() + else: + self._write_text(this) elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): self._handle_tag_open_close() From d9f23b8faaedb94d667372fb2a892307cf15a38a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 22 Dec 2012 21:58:21 -0500 Subject: [PATCH 08/77] Really basic, messy, and fragile tag attribute support. --- mwparserfromhell/parser/contexts.py | 73 +++++++++++++++++++----------------- mwparserfromhell/parser/tokenizer.py | 65 ++++++++++++++++++-------------- 2 files changed, 75 insertions(+), 63 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index a67bd76..053c930 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -65,11 +65,13 @@ Local (stack-specific) contexts: * :py:const:`TAG` * :py:const:`TAG_OPEN` - * :py:const:`TAG_ATTR` - * :py:const:`TAG_ATTR_NAME` - * :py:const:`TAG_ATTR_BODY` - * :py:const:`TAG_ATTR_BODY_QUOTED` + * :py:const:`TAG_OPEN_NAME` + * :py:const:`TAG_OPEN_ATTR` + + * :py:const:`TAG_OPEN_ATTR_NAME` + * :py:const:`TAG_OPEN_ATTR_BODY` + * :py:const:`TAG_OPEN_ATTR_BODY_QUOTED` * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` @@ -81,37 +83,38 @@ Global contexts: # Local contexts: -TEMPLATE = 0b00000000000000000111 -TEMPLATE_NAME = 0b00000000000000000001 -TEMPLATE_PARAM_KEY = 0b00000000000000000010 -TEMPLATE_PARAM_VALUE = 0b00000000000000000100 - -ARGUMENT = 0b00000000000000011000 -ARGUMENT_NAME = 0b00000000000000001000 -ARGUMENT_DEFAULT = 0b00000000000000010000 - -WIKILINK = 0b00000000000001100000 -WIKILINK_TITLE = 0b00000000000000100000 -WIKILINK_TEXT = 0b00000000000001000000 - -HEADING = 0b00000001111110000000 -HEADING_LEVEL_1 = 0b00000000000010000000 -HEADING_LEVEL_2 = 0b00000000000100000000 -HEADING_LEVEL_3 = 0b00000000001000000000 -HEADING_LEVEL_4 = 0b00000000010000000000 -HEADING_LEVEL_5 = 0b00000000100000000000 -HEADING_LEVEL_6 = 0b00000001000000000000 - -COMMENT = 0b00000010000000000000 - -TAG = 0b11111100000000000000 -TAG_OPEN = 0b00000100000000000000 -TAG_ATTR = 0b00111000000000000000 -TAG_ATTR_NAME = 0b00001000000000000000 -TAG_ATTR_BODY = 0b00010000000000000000 -TAG_ATTR_BODY_QUOTED = 0b00100000000000000000 
-TAG_BODY = 0b01000000000000000000 -TAG_CLOSE = 0b10000000000000000000 +TEMPLATE = 0b00000000000000000111 +TEMPLATE_NAME = 0b00000000000000000001 +TEMPLATE_PARAM_KEY = 0b00000000000000000010 +TEMPLATE_PARAM_VALUE = 0b00000000000000000100 + +ARGUMENT = 0b00000000000000011000 +ARGUMENT_NAME = 0b00000000000000001000 +ARGUMENT_DEFAULT = 0b00000000000000010000 + +WIKILINK = 0b00000000000001100000 +WIKILINK_TITLE = 0b00000000000000100000 +WIKILINK_TEXT = 0b00000000000001000000 + +HEADING = 0b00000001111110000000 +HEADING_LEVEL_1 = 0b00000000000010000000 +HEADING_LEVEL_2 = 0b00000000000100000000 +HEADING_LEVEL_3 = 0b00000000001000000000 +HEADING_LEVEL_4 = 0b00000000010000000000 +HEADING_LEVEL_5 = 0b00000000100000000000 +HEADING_LEVEL_6 = 0b00000001000000000000 + +COMMENT = 0b00000010000000000000 + +TAG = 0b11111100000000000000 +TAG_OPEN = 0b00111100000000000000 +TAG_OPEN_NAME = 0b00000100000000000000 +TAG_OPEN_ATTR = 0b00111000000000000000 +TAG_OPEN_ATTR_NAME = 0b00001000000000000000 +TAG_OPEN_ATTR_BODY = 0b00010000000000000000 +TAG_OPEN_ATTR_BODY_QUOTED = 0b00100000000000000000 +TAG_BODY = 0b01000000000000000000 +TAG_CLOSE = 0b10000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 99f5a7b..f65cbc1 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -426,7 +426,7 @@ class Tokenizer(object): reset = self._head self._head += 1 try: - tokens = self._parse(contexts.TAG_OPEN) + tokens = self._parse(contexts.TAG_OPEN_NAME) except BadRoute: self._head = reset self._write_text("<") @@ -438,34 +438,48 @@ class Tokenizer(object): if not self._stack: return None # Tag has an empty name? text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] - text = "".join([token.text for token in text]).strip().lower() + text = "".join([token.text for token in text]).rstrip().lower() try: return Tag.TRANSLATIONS[text] except KeyError: return Tag.TAG_UNKNOWN def _actually_close_tag_opening(self): - if self._context & contexts.TAG_ATTR: - if self._context & contexts.TAG_ATTR_BODY: - self._context ^= contexts.TAG_ATTR_BODY - if self._context & contexts.TAG_ATTR_BODY_QUOTED: - self._context ^= contexts.TAG_ATTR_BODY_QUOTED - else: - self._context ^= contexts.TAG_ATTR_NAME + if self._context & contexts.TAG_OPEN_ATTR: + if self._context & contexts.TAG_OPEN_ATTR_NAME: + self._context ^= contexts.TAG_OPEN_ATTR_NAME + if self._context & contexts.TAG_OPEN_ATTR_BODY: + self._context ^= contexts.TAG_OPEN_ATTR_BODY + if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED else: tag = self._get_tag_type_from_stack() - if tag is None: + if not tag: self._fail_route() self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) - - self._context ^= contexts.TAG_OPEN + self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY padding = "" # TODO return padding - # def _handle_attribute(self): - # if not self._context & contexts.TAG_ATTR: - # self._handle_tag_close_name() + def _handle_tag_chunk(self, text): + if " " not in text: + self._write_text(text) + return + chunks = text.split(" ") + if self._context & contexts.TAG_OPEN_NAME: + self._write_text(chunks.pop(0)) + tag = self._get_tag_type_from_stack() + if not tag: + self._fail_route() + self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._context ^= contexts.TAG_OPEN_NAME + self._context |= contexts.TAG_OPEN_ATTR_NAME + self._write(tokens.TagAttrStart()) + for i, 
chunk in enumerate(chunks): + if i > 0: + self._write(tokens.TagAttrStart()) + self._write_text(chunk) # def _handle_attribute_name(self): # ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED @@ -505,7 +519,10 @@ class Tokenizer(object): while True: this = self._read() if this not in self.MARKERS: - self._write_text(this) + if self._context & contexts.TAG_OPEN: + self._handle_tag_chunk(this) + else: + self._write_text(this) self._head += 1 continue if this is self.END: @@ -567,25 +584,17 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - # elif this == " " and (self._context & contexts.TAG_OPEN and not - # self._context & contexts.TAG_ATTR_BODY_QUOTED): - # self._handle_attribute() - # elif this == "=" and self._context & contexts.TAG_ATTR_NAME: - # self._handle_attribute_name() - # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: - # self._handle_quoted_attribute_close() - elif self._context & contexts.TAG_OPEN and ( - not self._context & contexts.TAG_ATTR_BODY_QUOTED): + elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_BODY_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() self._fail_route() elif this == ">": self._handle_tag_close_open() - elif this == "/": + elif this == "/" and next == ">": return self._handle_tag_selfclose() - else: - self._write_text(this) + # elif this == "=": + # self._handle_tag_attr_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): self._handle_tag_open_close() From d459899649362773ca0db16da37bebfc1f3ce180 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 18:38:31 -0500 Subject: [PATCH 09/77] More attribute stuff. 
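
One detail worth noting: splitting on single spaces keeps the padding
information, because every leading space becomes an empty-string chunk. A
small sketch of that mechanism in isolation (not the tokenizer itself):

    def take_padding(chunks):
        # "  a=b".split(" ") -> ["", "", "a=b"]; each empty string is one
        # space of padding in front of the attribute.
        padding = 0
        while chunks and chunks[0] == "":
            padding += 1
            chunks.pop(0)
        return " " * padding

    chunks = "  name=foo".split(" ")
    assert take_padding(chunks) == "  "
    assert chunks == ["name=foo"]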
--- mwparserfromhell/parser/builder.py | 10 +++--- mwparserfromhell/parser/tokenizer.py | 65 +++++++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 90274fa..cb5499f 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -180,9 +180,9 @@ class Builder(object): else: self._write(self._handle_token(token)) - def _handle_attribute(self): + def _handle_attribute(self, token): """Handle a case where a tag attribute is at the head of the tokens.""" - name, quoted = None, False + name, quoted, padding = None, False, token.padding self._push() while self._tokens: token = self._tokens.pop() @@ -195,8 +195,8 @@ class Builder(object): tokens.TagCloseOpen)): self._tokens.append(token) if name is not None: - return Attribute(name, self._pop(), quoted) - return Attribute(self._pop(), quoted=quoted) + return Attribute(name, self._pop(), quoted, padding) + return Attribute(self._pop(), quoted=quoted, padding=padding) else: self._write(self._handle_token(token)) @@ -208,7 +208,7 @@ class Builder(object): while self._tokens: token = self._tokens.pop() if isinstance(token, tokens.TagAttrStart): - attrs.append(self._handle_attribute()) + attrs.append(self._handle_attribute(token)) elif isinstance(token, tokens.TagCloseOpen): open_pad = token.padding tag = self._pop() diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index f65cbc1..d3cb40f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -450,8 +450,6 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_ATTR_NAME if self._context & contexts.TAG_OPEN_ATTR_BODY: self._context ^= contexts.TAG_OPEN_ATTR_BODY - if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: - self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED else: tag = self._get_tag_type_from_stack() if not tag: @@ -462,6 +460,20 @@ class Tokenizer(object): padding = "" # TODO return padding + def _actually_handle_chunk(self, chunks, is_new): + if is_new: + padding = 0 + while chunks: + if chunks[0] == "": + padding += 1 + chunks.pop(0) + else: + break + self._write(tokens.TagAttrStart(padding=" " * padding)) + if chunks: + chunk = chunks.pop(0) + self._write_text(chunk) + def _handle_tag_chunk(self, text): if " " not in text: self._write_text(text) @@ -475,18 +487,29 @@ class Tokenizer(object): self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME - self._write(tokens.TagAttrStart()) - for i, chunk in enumerate(chunks): - if i > 0: - self._write(tokens.TagAttrStart()) - self._write_text(chunk) - - # def _handle_attribute_name(self): - # ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED - # pass - - # def _handle_quoted_attribute_close(self): - # pass + self._actually_handle_chunk(chunks, True) + is_new = False + while chunks: + self._actually_handle_chunk(chunks, is_new) + is_new = True + + def _handle_tag_attribute_body(self): + self._context ^= contexts.TAG_OPEN_ATTR_NAME + self._context |= contexts.TAG_OPEN_ATTR_BODY + self._write(TagAttrEquals()) + next = self._read(1) + if next not in self.MARKERS and next.startswith('"'): + if re.search(r'[^\\]"$', next[1:]): + if not re.search(r'[^\\]"', next[1:-1]): + self._write(TagAttrQuote()) + self._write_text(next[1:-1]) + self._head += 1 + else: + if not re.search(r'[^\\]"', next[1:]): + 
self._push(contexts.TAG_OPEN_ATTR_BODY_QUOTED) + self._write(TagAttrQuote()) + self._write_text(next[1:]) + self._head += 1 def _handle_tag_close_open(self): padding = self._actually_close_tag_opening() @@ -526,10 +549,12 @@ class Tokenizer(object): self._head += 1 continue if this is self.END: - fail = (contexts.TEMPLATE | contexts.ARGUMENT | - contexts.WIKILINK | contexts.HEADING | - contexts.COMMENT | contexts.TAG) - double_fail = contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE + fail = ( + contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | + contexts.HEADING | contexts.COMMENT | contexts.TAG) + double_fail = ( + contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | + contexts.TAG_OPEN_ATTR_BODY_QUOTED) if self._context & double_fail: self._pop() if self._context & fail: @@ -593,8 +618,8 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - # elif this == "=": - # self._handle_tag_attr_body() + elif this == "=": + self._handle_tag_attribute_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): self._handle_tag_open_close() From 26d30f3d1a8c0caca854f7040d07555c6f794b0f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 19:18:09 -0500 Subject: [PATCH 10/77] Seems to be working for quoted attributes now. --- mwparserfromhell/parser/tokenizer.py | 40 ++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index d3cb40f..920d1cf 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -461,7 +461,7 @@ class Tokenizer(object): return padding def _actually_handle_chunk(self, chunks, is_new): - if is_new: + if is_new and not self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: padding = 0 while chunks: if chunks[0] == "": @@ -472,6 +472,15 @@ class Tokenizer(object): self._write(tokens.TagAttrStart(padding=" " * padding)) if chunks: chunk = chunks.pop(0) + if self._context & contexts.TAG_OPEN_ATTR_BODY: + self._context ^= contexts.TAG_OPEN_ATTR_BODY + self._context |= contexts.TAG_OPEN_ATTR_NAME + if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + if re.search(r'[^\\]"', chunk[:-1]): + self._fail_route() + if re.search(r'[^\\]"$', chunk): + self._write_text(chunk[:-1]) + return self._pop() # Back to _handle_tag_attribute_body() self._write_text(chunk) def _handle_tag_chunk(self, text): @@ -490,26 +499,35 @@ class Tokenizer(object): self._actually_handle_chunk(chunks, True) is_new = False while chunks: - self._actually_handle_chunk(chunks, is_new) + should_exit = self._actually_handle_chunk(chunks, is_new) + if should_exit: + return should_exit is_new = True def _handle_tag_attribute_body(self): self._context ^= contexts.TAG_OPEN_ATTR_NAME self._context |= contexts.TAG_OPEN_ATTR_BODY - self._write(TagAttrEquals()) + self._write(tokens.TagAttrEquals()) next = self._read(1) if next not in self.MARKERS and next.startswith('"'): if re.search(r'[^\\]"$', next[1:]): if not re.search(r'[^\\]"', next[1:-1]): - self._write(TagAttrQuote()) + self._write(tokens.TagAttrQuote()) self._write_text(next[1:-1]) self._head += 1 else: if not re.search(r'[^\\]"', next[1:]): - self._push(contexts.TAG_OPEN_ATTR_BODY_QUOTED) - self._write(TagAttrQuote()) - self._write_text(next[1:]) self._head += 1 + reset = self._head + try: + attr = self._parse(contexts.TAG_OPEN_ATTR_BODY_QUOTED) + except BadRoute: + self._head = reset + 
self._write_text(next) + else: + self._write(tokens.TagAttrQuote()) + self._write_text(next[1:]) + self._write_all(attr) def _handle_tag_close_open(self): padding = self._actually_close_tag_opening() @@ -543,7 +561,9 @@ class Tokenizer(object): this = self._read() if this not in self.MARKERS: if self._context & contexts.TAG_OPEN: - self._handle_tag_chunk(this) + should_exit = self._handle_tag_chunk(this) + if should_exit: + return should_exit else: self._write_text(this) self._head += 1 @@ -593,6 +613,8 @@ class Tokenizer(object): elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", self.START): self._parse_heading() + elif self._context & contexts.TAG_OPEN_ATTR_NAME: + self._handle_tag_attribute_body() else: self._write_text("=") elif this == "=" and self._context & contexts.HEADING: @@ -618,7 +640,7 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - elif this == "=": + elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: self._handle_tag_attribute_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): From ca47305074aa04585d29dd91f346079e57156f53 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 21:35:48 -0500 Subject: [PATCH 11/77] Fix attribute behavior under certain strange circumstances. --- mwparserfromhell/parser/tokenizer.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 920d1cf..46c4399 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -470,6 +470,7 @@ class Tokenizer(object): else: break self._write(tokens.TagAttrStart(padding=" " * padding)) + if chunks: chunk = chunks.pop(0) if self._context & contexts.TAG_OPEN_ATTR_BODY: @@ -480,7 +481,9 @@ class Tokenizer(object): self._fail_route() if re.search(r'[^\\]"$', chunk): self._write_text(chunk[:-1]) - return self._pop() # Back to _handle_tag_attribute_body() + self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED + self._context |= contexts.TAG_OPEN_ATTR_NAME + return True # Back to _handle_tag_attribute_body() self._write_text(chunk) def _handle_tag_chunk(self, text): @@ -497,12 +500,15 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME self._actually_handle_chunk(chunks, True) + is_new = False + is_quoted = False while chunks: - should_exit = self._actually_handle_chunk(chunks, is_new) - if should_exit: - return should_exit + result = self._actually_handle_chunk(chunks, is_new) + is_quoted = result or is_quoted is_new = True + if is_quoted: + return self._pop() def _handle_tag_attribute_body(self): self._context ^= contexts.TAG_OPEN_ATTR_NAME @@ -510,6 +516,10 @@ class Tokenizer(object): self._write(tokens.TagAttrEquals()) next = self._read(1) if next not in self.MARKERS and next.startswith('"'): + chunks = None + if " " in next: + chunks = next.split(" ") + next = chunks.pop(0) if re.search(r'[^\\]"$', next[1:]): if not re.search(r'[^\\]"', next[1:-1]): self._write(tokens.TagAttrQuote()) @@ -528,6 +538,10 @@ class Tokenizer(object): self._write(tokens.TagAttrQuote()) self._write_text(next[1:]) self._write_all(attr) + self._context ^= contexts.TAG_OPEN_ATTR_BODY + self._context |= contexts.TAG_OPEN_ATTR_NAME + while chunks: + self._actually_handle_chunk(chunks, True) def _handle_tag_close_open(self): padding = 
self._actually_close_tag_opening()

From 146d1fd006c32b4a71312cd966c3e124592bce92 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 23 Dec 2012 21:44:56 -0500
Subject: [PATCH 12/77] Fix a bug in rendering Tags; attrs->attributes; update documentation.

---
 docs/api/mwparserfromhell.nodes.rst | 1 +
 docs/api/mwparserfromhell.rst       | 6 ++++++
 mwparserfromhell/nodes/tag.py       | 4 ++--
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/docs/api/mwparserfromhell.nodes.rst b/docs/api/mwparserfromhell.nodes.rst
index d1016f9..a093c17 100644
--- a/docs/api/mwparserfromhell.nodes.rst
+++ b/docs/api/mwparserfromhell.nodes.rst
@@ -46,6 +46,7 @@ nodes Package
 
 .. automodule:: mwparserfromhell.nodes.tag
     :members:
+    :undoc-members:
     :show-inheritance:
 
 :mod:`template` Module

diff --git a/docs/api/mwparserfromhell.rst b/docs/api/mwparserfromhell.rst
index 3ca09c9..b682139 100644
--- a/docs/api/mwparserfromhell.rst
+++ b/docs/api/mwparserfromhell.rst
@@ -30,6 +30,12 @@ mwparserfromhell Package
     :members:
     :undoc-members:
 
+:mod:`tag_defs` Module
+----------------------
+
+.. automodule:: mwparserfromhell.tag_defs
+    :members:
+
 :mod:`utils` Module
 -------------------

diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index ea98bb6..833b597 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -65,7 +65,7 @@ class Tag(TagDefinitions, Node):
             result += self.open_padding + "/>"
         else:
             result += self.open_padding + ">" + str(self.contents)
-            result += "</" + str(self.tag) + ">"
+            result += "</" + str(self.closing_tag) + ">"
         return result
@@ -126,7 +126,7 @@ class Tag(TagDefinitions, Node):
         return self._contents
 
     @property
-    def attrs(self):
+    def attributes(self):
         """The list of attributes affecting the tag.
 
         Each attribute is an instance of :py:class:`~.Attribute`.

From a58c480639119b2cd3c78eee8dfe0893fa6360fc Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 23 Dec 2012 22:23:31 -0500
Subject: [PATCH 13/77] Fix some usage of attrs; shorten a context, fix some behavior I broke. 
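
The renamed masks keep the superset convention, so a check like "in a tag
opening, but not in a quoted attribute body" can XOR the aggregate mask
against one of its members. A tiny sketch with made-up bit values:

    TAG_OPEN_NAME = 1 << 0
    TAG_OPEN_ATTR_QUOTED = 1 << 1
    TAG_OPEN = TAG_OPEN_NAME | TAG_OPEN_ATTR_QUOTED  # aggregate mask

    # XOR strips the quoted bit from the aggregate, leaving "any other
    # TAG_OPEN sub-context":
    unquoted = TAG_OPEN ^ TAG_OPEN_ATTR_QUOTED

    assert TAG_OPEN_NAME & unquoted
    assert not TAG_OPEN_ATTR_QUOTED & unquoted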
--- mwparserfromhell/nodes/tag.py | 11 +++--- mwparserfromhell/parser/contexts.py | 68 +++++++++++++++++++----------------- mwparserfromhell/parser/tokenizer.py | 29 +++++++++------ 3 files changed, 60 insertions(+), 48 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 833b597..94f92c5 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -59,8 +59,8 @@ class Tag(TagDefinitions, Node): return open_ + str(self.contents) + close result = "<" + str(self.tag) - if self.attrs: - result += " " + " ".join([str(attr) for attr in self.attrs]) + if self.attributes: + result += " " + " ".join([str(attr) for attr in self.attributes]) if self.self_closing: result += self.open_padding + "/>" else: @@ -73,7 +73,7 @@ class Tag(TagDefinitions, Node): if self.showtag: for child in getter(self.tag): yield self.tag, child - for attr in self.attrs: + for attr in self.attributes: for child in getter(attr.name): yield attr.name, child if attr.value: @@ -89,12 +89,13 @@ class Tag(TagDefinitions, Node): def __showtree__(self, write, get, mark): tagnodes = self.tag.nodes - if (not self.attrs and len(tagnodes) == 1 and isinstance(tagnodes[0], Text)): + if not self.attributes and (len(tagnodes) == 1 and + isinstance(tagnodes[0], Text)): write("<" + str(tagnodes[0]) + ">") else: write("<") get(self.tag) - for attr in self.attrs: + for attr in self.attributes: get(attr.name) if not attr.value: continue diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 053c930..d87da9a 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -71,7 +71,8 @@ Local (stack-specific) contexts: * :py:const:`TAG_OPEN_ATTR_NAME` * :py:const:`TAG_OPEN_ATTR_BODY` - * :py:const:`TAG_OPEN_ATTR_BODY_QUOTED` + * :py:const:`TAG_OPEN_ATTR_QUOTED` + * :py:const:`TAG_OPEN_ATTR_IGNORE` * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` @@ -83,38 +84,39 @@ Global contexts: # Local contexts: -TEMPLATE = 0b00000000000000000111 -TEMPLATE_NAME = 0b00000000000000000001 -TEMPLATE_PARAM_KEY = 0b00000000000000000010 -TEMPLATE_PARAM_VALUE = 0b00000000000000000100 - -ARGUMENT = 0b00000000000000011000 -ARGUMENT_NAME = 0b00000000000000001000 -ARGUMENT_DEFAULT = 0b00000000000000010000 - -WIKILINK = 0b00000000000001100000 -WIKILINK_TITLE = 0b00000000000000100000 -WIKILINK_TEXT = 0b00000000000001000000 - -HEADING = 0b00000001111110000000 -HEADING_LEVEL_1 = 0b00000000000010000000 -HEADING_LEVEL_2 = 0b00000000000100000000 -HEADING_LEVEL_3 = 0b00000000001000000000 -HEADING_LEVEL_4 = 0b00000000010000000000 -HEADING_LEVEL_5 = 0b00000000100000000000 -HEADING_LEVEL_6 = 0b00000001000000000000 - -COMMENT = 0b00000010000000000000 - -TAG = 0b11111100000000000000 -TAG_OPEN = 0b00111100000000000000 -TAG_OPEN_NAME = 0b00000100000000000000 -TAG_OPEN_ATTR = 0b00111000000000000000 -TAG_OPEN_ATTR_NAME = 0b00001000000000000000 -TAG_OPEN_ATTR_BODY = 0b00010000000000000000 -TAG_OPEN_ATTR_BODY_QUOTED = 0b00100000000000000000 -TAG_BODY = 0b01000000000000000000 -TAG_CLOSE = 0b10000000000000000000 +TEMPLATE = 0b000000000000000000111 +TEMPLATE_NAME = 0b000000000000000000001 +TEMPLATE_PARAM_KEY = 0b000000000000000000010 +TEMPLATE_PARAM_VALUE = 0b000000000000000000100 + +ARGUMENT = 0b000000000000000011000 +ARGUMENT_NAME = 0b000000000000000001000 +ARGUMENT_DEFAULT = 0b000000000000000010000 + +WIKILINK = 0b000000000000001100000 +WIKILINK_TITLE = 0b000000000000000100000 +WIKILINK_TEXT = 0b000000000000001000000 + +HEADING = 0b000000001111110000000 
+HEADING_LEVEL_1 = 0b000000000000010000000 +HEADING_LEVEL_2 = 0b000000000000100000000 +HEADING_LEVEL_3 = 0b000000000001000000000 +HEADING_LEVEL_4 = 0b000000000010000000000 +HEADING_LEVEL_5 = 0b000000000100000000000 +HEADING_LEVEL_6 = 0b000000001000000000000 + +COMMENT = 0b000000010000000000000 + +TAG = 0b111111100000000000000 +TAG_OPEN = 0b001111100000000000000 +TAG_OPEN_NAME = 0b000000100000000000000 +TAG_OPEN_ATTR = 0b001111000000000000000 +TAG_OPEN_ATTR_NAME = 0b000001000000000000000 +TAG_OPEN_ATTR_BODY = 0b000010000000000000000 +TAG_OPEN_ATTR_QUOTED = 0b000100000000000000000 +TAG_OPEN_ATTR_IGNORE = 0b001000000000000000000 +TAG_BODY = 0b010000000000000000000 +TAG_CLOSE = 0b100000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 46c4399..1d31fa4 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -457,11 +457,13 @@ class Tokenizer(object): self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY - padding = "" # TODO + + ## If the last element was TagAttrStart, remove it, add " " to its padding, then return that + padding = "" return padding def _actually_handle_chunk(self, chunks, is_new): - if is_new and not self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: padding = 0 while chunks: if chunks[0] == "": @@ -470,18 +472,24 @@ class Tokenizer(object): else: break self._write(tokens.TagAttrStart(padding=" " * padding)) + elif self._context & contexts.TAG_OPEN_ATTR_IGNORE: + self._context ^= contexts.TAG_OPEN_ATTR_IGNORE + chunks.pop(0) + return + elif self._context & contexts.TAG_OPEN_ATTR_QUOTED: + self._write_text(" ") # Quoted chunks don't lose their spaces if chunks: chunk = chunks.pop(0) if self._context & contexts.TAG_OPEN_ATTR_BODY: self._context ^= contexts.TAG_OPEN_ATTR_BODY self._context |= contexts.TAG_OPEN_ATTR_NAME - if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + if self._context & contexts.TAG_OPEN_ATTR_QUOTED: if re.search(r'[^\\]"', chunk[:-1]): self._fail_route() if re.search(r'[^\\]"$', chunk): self._write_text(chunk[:-1]) - self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED + self._context ^= contexts.TAG_OPEN_ATTR_QUOTED self._context |= contexts.TAG_OPEN_ATTR_NAME return True # Back to _handle_tag_attribute_body() self._write_text(chunk) @@ -491,6 +499,8 @@ class Tokenizer(object): self._write_text(text) return chunks = text.split(" ") + is_new = False + is_quoted = False if self._context & contexts.TAG_OPEN_NAME: self._write_text(chunks.pop(0)) tag = self._get_tag_type_from_stack() @@ -500,9 +510,7 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME self._actually_handle_chunk(chunks, True) - - is_new = False - is_quoted = False + is_new = True while chunks: result = self._actually_handle_chunk(chunks, is_new) is_quoted = result or is_quoted @@ -530,7 +538,7 @@ class Tokenizer(object): self._head += 1 reset = self._head try: - attr = self._parse(contexts.TAG_OPEN_ATTR_BODY_QUOTED) + attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | contexts.TAG_OPEN_ATTR_IGNORE) except BadRoute: self._head = reset self._write_text(next) @@ -538,6 +546,7 @@ class Tokenizer(object): self._write(tokens.TagAttrQuote()) self._write_text(next[1:]) self._write_all(attr) + return self._context ^= contexts.TAG_OPEN_ATTR_BODY self._context |= 
contexts.TAG_OPEN_ATTR_NAME while chunks: @@ -588,7 +597,7 @@ class Tokenizer(object): contexts.HEADING | contexts.COMMENT | contexts.TAG) double_fail = ( contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | - contexts.TAG_OPEN_ATTR_BODY_QUOTED) + contexts.TAG_OPEN_ATTR_QUOTED) if self._context & double_fail: self._pop() if self._context & fail: @@ -645,7 +654,7 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_BODY_QUOTED): + elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() From eed7c918bfb0741fefd0473f61bbc1e9343ad033 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 22:41:32 -0500 Subject: [PATCH 14/77] Implement padding support for Tags completely; open_padding->padding. --- mwparserfromhell/nodes/tag.py | 18 +++++++++--------- mwparserfromhell/parser/builder.py | 6 +++--- mwparserfromhell/parser/tokenizer.py | 15 +++++++++------ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 94f92c5..ecf6f2b 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -33,7 +33,7 @@ class Tag(TagDefinitions, Node): """Represents an HTML-style tag in wikicode, like ````.""" def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, - self_closing=False, open_padding="", closing_tag=None): + self_closing=False, padding="", closing_tag=None): super(Tag, self).__init__() self._type = type_ self._tag = tag @@ -44,7 +44,7 @@ class Tag(TagDefinitions, Node): self._attrs = [] self._showtag = showtag self._self_closing = self_closing - self._open_padding = open_padding + self._padding = padding if closing_tag: self._closing_tag = closing_tag else: @@ -62,9 +62,9 @@ class Tag(TagDefinitions, Node): if self.attributes: result += " " + " ".join([str(attr) for attr in self.attributes]) if self.self_closing: - result += self.open_padding + "/>" + result += self.padding + "/>" else: - result += self.open_padding + ">" + str(self.contents) + result += self.padding + ">" + str(self.contents) result += "" return result @@ -145,9 +145,9 @@ class Tag(TagDefinitions, Node): return self._self_closing @property - def open_padding(self): + def padding(self): """Spacing to insert before the first closing ``>``.""" - return self._open_padding + return self._padding @property def closing_tag(self): @@ -188,9 +188,9 @@ class Tag(TagDefinitions, Node): def self_closing(self, value): self._self_closing = bool(value) - @open_padding.setter - def open_padding(self, value): - self._open_padding = str(value) + @padding.setter + def padding(self, value): + self._padding = str(value) @closing_tag.setter def closing_tag(self, value): diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index cb5499f..2d9ea55 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -210,19 +210,19 @@ class Builder(object): if isinstance(token, tokens.TagAttrStart): attrs.append(self._handle_attribute(token)) elif isinstance(token, tokens.TagCloseOpen): - open_pad = token.padding + padding = token.padding tag = self._pop() self._push() elif isinstance(token, tokens.TagCloseSelfclose): tag = self._pop() return Tag(type_, tag, attrs=attrs, showtag=showtag, - self_closing=True, open_padding=token.padding) + 
self_closing=True, padding=token.padding) elif isinstance(token, tokens.TagOpenClose): contents = self._pop() self._push() elif isinstance(token, tokens.TagCloseClose): return Tag(type_, tag, contents, attrs, showtag, False, - open_pad, self._pop()) + padding, self._pop()) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 1d31fa4..901e731 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -458,9 +458,9 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY - ## If the last element was TagAttrStart, remove it, add " " to its padding, then return that - padding = "" - return padding + if isinstance(self._stack[-1], tokens.TagAttrStart): + return self._stack.pop().padding + return "" def _actually_handle_chunk(self, chunks, is_new): if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: @@ -538,7 +538,8 @@ class Tokenizer(object): self._head += 1 reset = self._head try: - attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | contexts.TAG_OPEN_ATTR_IGNORE) + attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | + contexts.TAG_OPEN_ATTR_IGNORE) except BadRoute: self._head = reset self._write_text(next) @@ -654,7 +655,8 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): + elif self._context & ( + contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() @@ -663,7 +665,8 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: + elif this == "=" and ( + self._context & contexts.TAG_OPEN_ATTR_NAME): self._handle_tag_attribute_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): From 6ea618460fc122dcd60ebebd0ecf02a36f82d8cf Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 31 Dec 2012 03:19:22 -0500 Subject: [PATCH 15/77] _get_tag_type_from_stack() makes more sense now --- mwparserfromhell/parser/tokenizer.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 901e731..e83ec5d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -433,16 +433,18 @@ class Tokenizer(object): else: self._write_all(tokens) - def _get_tag_type_from_stack(self): - self._push_textbuffer() - if not self._stack: - return None # Tag has an empty name? - text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] + def _get_tag_type_from_stack(self, stack=None): + if stack is None: + stack = self._stack + self._push_textbuffer() + if not stack: + self._fail_route() # Tag has an empty name? 
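
The lookup performed by _get_tag_type_from_stack() above is compact enough to restate on its own. The sketch below is illustrative, with TRANSLATIONS abbreviated from the table on the Tag node (where TAG_BOLD is 2 and TAG_REF is 101) and a stand-in Text token:

    from collections import namedtuple

    Text = namedtuple("Text", "text")
    TRANSLATIONS = {"b": 2, "ref": 101}  # abbreviated from Tag.TRANSLATIONS

    def tag_type_from(stack):
        # Join the stack's raw Text tokens, normalize the result, and map
        # it to a numeric tag type; a KeyError here becomes _fail_route().
        text = "".join(tok.text for tok in stack if isinstance(tok, Text))
        return TRANSLATIONS[text.rstrip().lower()]

    assert tag_type_from([Text("re"), Text("f ")]) == 101
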
+ text = [tok for tok in stack if isinstance(tok, tokens.Text)] text = "".join([token.text for token in text]).rstrip().lower() try: return Tag.TRANSLATIONS[text] except KeyError: - return Tag.TAG_UNKNOWN + self._fail_route() def _actually_close_tag_opening(self): if self._context & contexts.TAG_OPEN_ATTR: @@ -452,8 +454,6 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_ATTR_BODY else: tag = self._get_tag_type_from_stack() - if not tag: - self._fail_route() self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY @@ -504,8 +504,6 @@ class Tokenizer(object): if self._context & contexts.TAG_OPEN_NAME: self._write_text(chunks.pop(0)) tag = self._get_tag_type_from_stack() - if not tag: - self._fail_route() self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME @@ -569,8 +567,8 @@ class Tokenizer(object): self._head += 1 def _handle_tag_close_close(self): - tag = self._get_tag_type_from_stack() closing = self._pop() + tag = self._get_tag_type_from_stack(closing) if tag != self._stack[0].type: # Closing and opening tags are not the same, so fail this route: self._fail_route() From 0ee505b5a506cfc1c0530935bb01933b94aa14dc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 24 Jan 2013 01:24:06 -0500 Subject: [PATCH 16/77] Docstrings for new tokenizer methods. --- mwparserfromhell/parser/tokenizer.py | 41 ++++++++++++++++++++++++++++++------ mwparserfromhell/tag_defs.py | 2 +- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e83ec5d..8ec3355 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -434,6 +434,10 @@ class Tokenizer(object): self._write_all(tokens) def _get_tag_type_from_stack(self, stack=None): + """Return the tag type based on the text in *stack*. + + If *stack* is ``None``, we will use the current, topmost one. + """ if stack is None: stack = self._stack self._push_textbuffer() @@ -447,6 +451,13 @@ class Tokenizer(object): self._fail_route() def _actually_close_tag_opening(self): + """Handle cleanup at the end of an opening tag. + + The current context will be updated and the + :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the + opening tag's padding to be used in the + :py:class:`~.tokens.TagOpenClose` token. + """ if self._context & contexts.TAG_OPEN_ATTR: if self._context & contexts.TAG_OPEN_ATTR_NAME: self._context ^= contexts.TAG_OPEN_ATTR_NAME @@ -463,6 +474,11 @@ class Tokenizer(object): return "" def _actually_handle_chunk(self, chunks, is_new): + """Actually handle a chunk of code within a tag's attributes. + + Called by :py:meth:`_handle_tag_chunk` and + :py:meth:`_handle_tag_attribute_body`. + """ if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: padding = 0 while chunks: if chunks[0] == "": @@ -495,6 +511,12 @@ class Tokenizer(object): self._write_text(chunk) def _handle_tag_chunk(self, text): + """Handle a chunk of code within a tag's attributes. + + This is called by :py:meth:`_parse`, which intercepts parsing of + wikicode when we're inside of an opening tag and no :py:attr:`MARKERS` + are present. + """ if " " not in text: self._write_text(text) return @@ -517,6 +539,12 @@ class Tokenizer(object): return self._pop() def _handle_tag_attribute_body(self): + """Handle the body, or value, of a tag attribute.
+ + Attribute bodies can usually be handled at once, but sometimes a new + stack must be created to keep track of "rich" attribute values that + contain, for example, templates. + """ self._context ^= contexts.TAG_OPEN_ATTR_NAME self._context |= contexts.TAG_OPEN_ATTR_BODY self._write(tokens.TagAttrEquals()) @@ -552,21 +580,25 @@ class Tokenizer(object): self._actually_handle_chunk(chunks, True) def _handle_tag_close_open(self): + """Handle the ending of an open tag (````).""" padding = self._actually_close_tag_opening() self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): + """Handle the ending of a tag that closes itself (````).""" padding = self._actually_close_tag_opening() self._write(tokens.TagCloseSelfclose(padding=padding)) self._head += 1 return self._pop() def _handle_tag_open_close(self): + """Handle the opening of a closing tag (````).""" self._write(tokens.TagOpenClose()) self._push(contexts.TAG_CLOSE) self._head += 1 def _handle_tag_close_close(self): + """Handle the ending of a closing tag (````).""" closing = self._pop() tag = self._get_tag_type_from_stack(closing) if tag != self._stack[0].type: @@ -653,8 +685,7 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & ( - contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): + elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() @@ -663,11 +694,9 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - elif this == "=" and ( - self._context & contexts.TAG_OPEN_ATTR_NAME): + elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: self._handle_tag_attribute_body() - elif this == "<" and next == "/" and ( - self._context & contexts.TAG_BODY): + elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py index 74d3a81..b2ee90d 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/tag_defs.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012 Ben Kurtovic +# Copyright (C) 2012-2013 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal From d8814968b71fdd9ceea22085c19d43b69101ba38 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 14 Mar 2013 11:02:10 -0400 Subject: [PATCH 17/77] Applying latest commit from develop --- mwparserfromhell/parser/__init__.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 5baa687..fd8a314 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -26,16 +26,16 @@ modules: the :py:mod:`~.tokenizer` and the :py:mod:`~.builder`. This module joins them together under one interface.
""" +from .builder import Builder +from .tokenizer import Tokenizer try: - from ._builder import CBuilder as Builder + from ._tokenizer import CTokenizer + use_c = True except ImportError: - from .builder import Builder -try: - from ._tokenizer import CTokenizer as Tokenizer -except ImportError: - from .tokenizer import Tokenizer + CTokenizer = None + use_c = False -__all__ = ["Parser"] +__all__ = ["use_c", "Parser"] class Parser(object): """Represents a parser for wikicode. @@ -48,7 +48,10 @@ class Parser(object): def __init__(self, text): self.text = text - self._tokenizer = Tokenizer() + if use_c and CTokenizer: + self._tokenizer = CTokenizer() + else: + self._tokenizer = Tokenizer() self._builder = Builder() def parse(self): From 61fc5b5eab7dbe9c0466fd07a656c8490d8d04ad Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 May 2013 14:41:48 -0400 Subject: [PATCH 18/77] Fix handling of self-closing tags (closes #31) --- mwparserfromhell/nodes/tag.py | 5 +++-- mwparserfromhell/parser/builder.py | 4 ++-- mwparserfromhell/parser/tokenizer.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index eb5d1ee..d301d85 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -79,8 +79,9 @@ class Tag(TagDefinitions, Node): if attr.value: for child in getter(attr.value): yield attr.value, child - for child in getter(self.contents): - yield self.contents, child + if self.contents: + for child in getter(self.contents): + yield self.contents, child def __strip__(self, normalize, collapse): if self.type in self.TAGS_VISIBLE: diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 60bfaa9..4b468b7 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -191,8 +191,8 @@ class Builder(object): self._push() elif isinstance(token, tokens.TagAttrQuote): quoted = True - elif isinstance(token, (tokens.TagAttrStart, - tokens.TagCloseOpen)): + elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen, + tokens.TagCloseSelfclose)): self._tokens.append(token) if name is not None: return Attribute(name, self._pop(), quoted, padding) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 82f748c..b466de5 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -26,8 +26,8 @@ import re from . import contexts from . import tokens -from ..nodes.tag import Tag from ..compat import htmlentities +from ..nodes.tag import Tag __all__ = ["Tokenizer"] @@ -431,7 +431,7 @@ class Tokenizer(object): try: return Tag.TRANSLATIONS[text] except KeyError: - self._fail_route() + return Tag.TAG_UNKNOWN def _actually_close_tag_opening(self): """Handle cleanup at the end of a opening tag. From 1b4c01b4c00d014499d9f5e5ad8ecc01bb20a2b7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 20 May 2013 03:05:11 -0400 Subject: [PATCH 19/77] Implement assertTagNodeEqual(), start test_tag(), add to tags.mwtest. 
--- mwparserfromhell/parser/builder.py | 2 +- tests/_test_tree_equality.py | 19 +++++++- tests/test_attribute.py | 0 tests/test_builder.py | 12 +++++- tests/test_tag.py | 0 tests/tokenizer/tags.mwtest | 88 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 117 insertions(+), 4 deletions(-) create mode 100644 tests/test_attribute.py create mode 100644 tests/test_tag.py create mode 100644 tests/tokenizer/tags.mwtest diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 4b468b7..5ec0780 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -170,7 +170,7 @@ class Builder(object): self._write(self._handle_token(token)) def _handle_comment(self): - """Handle a case where a hidden comment is at the head of the tokens.""" + """Handle a case where an HTML comment is at the head of the tokens.""" self._push() while self._tokens: token = self._tokens.pop() diff --git a/tests/_test_tree_equality.py b/tests/_test_tree_equality.py index 52130ed..2828147 100644 --- a/tests/_test_tree_equality.py +++ b/tests/_test_tree_equality.py @@ -91,7 +91,24 @@ class TreeEqualityTestCase(TestCase): def assertTagNodeEqual(self, expected, actual): """Assert that two Tag nodes have the same data.""" - self.fail("Holding this until feature/html_tags is ready.") + self.assertEqual(expected.type, actual.type) + self.assertWikicodeEqual(expected.tag, actual.tag) + if expected.contents is not None: + self.assertWikicodeEqual(expected.contents, actual.contents) + length = len(expected.attributes) + self.assertEqual(length, len(actual.attributes)) + for i in range(length): + exp_attr = expected.attributes[i] + act_attr = actual.attributes[i] + self.assertWikicodeEqual(exp_attr.name, act_attr.name) + if exp_attr.value is not None: + self.assertWikicodeEqual(exp_attr.value, act_attr.value) + self.assertIs(exp_attr.quoted, act_attr.quoted) + self.assertEqual(exp_attr.padding, act_attr.padding) + self.assertIs(expected.showtag, actual.showtag) + self.assertIs(expected.self_closing, actual.self_closing) + self.assertEqual(expected.padding, actual.padding) + self.assertWikicodeEqual(expected.closing_tag, actual.closing_tag) def assertTemplateNodeEqual(self, expected, actual): """Assert that two Template nodes have the same data.""" diff --git a/tests/test_attribute.py b/tests/test_attribute.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_builder.py b/tests/test_builder.py index 903d144..85a8c60 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -190,10 +190,18 @@ class TestBuilder(TreeEqualityTestCase): for test, valid in tests: self.assertWikicodeEqual(valid, self.builder.build(test)) - @unittest.skip("holding this until feature/html_tags is ready") def test_tag(self): """tests for building Tag nodes""" - pass + tests = [ + ([tokens.TagOpenOpen(showtag=True, type=101), + tokens.Text(text="ref"), tokens.TagCloseOpen(padding=""), + tokens.TagOpenClose(), tokens.Text(text="ref"), + tokens.TagCloseClose()], + wrap([Tag(101, wraptext("ref"), wrap([]), [], True, False, "", + wraptext("ref"))])), + ] + for test, valid in tests: + self.assertWikicodeEqual(valid, self.builder.build(test)) def test_integration(self): """a test for building a combination of templates together""" diff --git a/tests/test_tag.py b/tests/test_tag.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest new file mode 100644 index 0000000..9a6ce30 --- /dev/null +++
b/tests/tokenizer/tags.mwtest @@ -0,0 +1,88 @@ +name: basic +label: a basic tag with an open and close +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: basic_selfclosing +label: a basic self-closing tag +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding="")] + +--- + +name: content +label: a tag with some content in the middle +input: "this is a reference" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: padded_open +label: a tag with some padding in the open tag +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: padded_close +label: a tag with some padding in the close tag +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] + +--- + +name: padded_selfclosing +label: a self-closing tag with padding +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding=" ")] + +--- + +name: attribute +label: a tag with a single attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_value +label: a tag with a single attribute with a value +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_quoted +label: a tag with a single quoted attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_hyphen +label: a tag with a single attribute, containing a hyphen +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_quoted_hyphen +label: a tag with a single quoted attribute, containing a hyphen +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_selfclosing +label: a self-closing tag with a single attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] + +--- + +name: attribute_selfclosing_value +label: a self-closing tag with a single attribute with a value +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] From 9ea06c283081771833729ec579b9aaee94599fe1 Mon Sep 17 00:00:00 2001 
From: Ben Kurtovic Date: Tue, 28 May 2013 10:58:45 -0400 Subject: [PATCH 20/77] Push the textbuffer to fix a couple broken tests. --- mwparserfromhell/parser/tokenizer.py | 1 + tests/tokenizer/tags.mwtest | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b466de5..b8450fd 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -452,6 +452,7 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY + self._push_textbuffer() if isinstance(self._stack[-1], tokens.TagAttrStart): return self._stack.pop().padding return "" diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 9a6ce30..8716e78 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -86,3 +86,10 @@ name: attribute_selfclosing_value label: a self-closing tag with a single attribute with a value input: "" output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] + +--- + +name: attribute_selfclosing_value_quoted +label: a self-closing tag with a single quoted attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] From d2b39546691eda327979b12dbe44c0090868c790 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 1 Jun 2013 17:30:34 -0400 Subject: [PATCH 21/77] Fix remaining broken tests; some refactoring. --- mwparserfromhell/parser/tokenizer.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b8450fd..67a652a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -476,7 +476,7 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_ATTR_IGNORE chunks.pop(0) return - elif self._context & contexts.TAG_OPEN_ATTR_QUOTED: + elif is_new and self._context & contexts.TAG_OPEN_ATTR_QUOTED: self._write_text(" ") # Quoted chunks don't lose their spaces if chunks: @@ -501,7 +501,7 @@ class Tokenizer(object): wikicode when we're inside of an opening tag and no :py:attr:`MARKERS` are present. 
""" - if " " not in text: + if " " not in text and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: self._write_text(text) return chunks = text.split(" ") @@ -603,7 +603,7 @@ class Tokenizer(object): elif this == "\n" or this == "[" or this == "}": return False return True - if context & contexts.TEMPLATE_NAME: + elif context & contexts.TEMPLATE_NAME: if this == "{" or this == "}" or this == "[": self._context |= contexts.FAIL_NEXT return True @@ -621,6 +621,8 @@ class Tokenizer(object): elif this is self.END or not this.isspace(): self._context |= contexts.HAS_TEXT return True + elif context & contexts.TAG_CLOSE: + return this != "<" and this != "\n" else: if context & contexts.FAIL_ON_EQUALS: if this == "=": @@ -653,10 +655,12 @@ class Tokenizer(object): while True: this = self._read() unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | - contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME) + contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME | + contexts.TAG_CLOSE) if self._context & unsafe: if not self._verify_safe(this): - if self._context & contexts.TEMPLATE_PARAM_KEY: + double = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) + if self._context & double: self._pop() self._fail_route() if this not in self.MARKERS: @@ -672,12 +676,12 @@ class Tokenizer(object): fail = ( contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | contexts.HEADING | contexts.COMMENT | contexts.TAG) - double_fail = ( - contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | - contexts.TAG_OPEN_ATTR_QUOTED) - if self._context & double_fail: - self._pop() if self._context & fail: + double_fail = ( + contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | + contexts.TAG_OPEN_ATTR_QUOTED) + if self._context & double_fail: + self._pop() self._fail_route() return self._pop() next = self._read(1) @@ -738,10 +742,10 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): - if this == "\n": - if self._context & contexts.TAG_CLOSE: - self._pop() + elif self._context & contexts.TAG_OPEN: + if self._context & contexts.TAG_OPEN_ATTR_QUOTED: + self._handle_tag_chunk(this) + elif this == "\n": self._fail_route() elif this == ">": self._handle_tag_close_open() @@ -749,6 +753,8 @@ class Tokenizer(object): return self._handle_tag_selfclose() elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: self._handle_tag_attribute_body() + else: + self._handle_tag_chunk(this) elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: From 03e41286c6caf940d9f14ae1bdbd03df4e112493 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 12 Jun 2013 18:29:22 -0400 Subject: [PATCH 22/77] Add a number of tag tests. A couple of these are failing. 
--- tests/tokenizer/integration.mwtest | 7 ++ tests/tokenizer/tags.mwtest | 140 +++++++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index d3cb419..ba01c8c 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -33,6 +33,13 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t --- +name: rich_tags +label: an HTML tag with tons of other things in it +input: "{{dubious claim}}[[Source]]" +output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(padding=" "), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(padding=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(padding=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(padding=""), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagOpenClose(), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + name: wildcard label: a wildcard assortment of various things input: "{{{{{{{{foo}}bar|baz=biz}}buzz}}usr|{{bin}}}}" diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 8716e78..5af2074 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -93,3 +93,143 @@ name: attribute_selfclosing_value_quoted label: a self-closing tag with a single quoted attribute input: "" output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] + +--- + +name: incomplete_lbracket +label: incomplete tags: just a left bracket +input: "<" +output: [Text(text="<")] + +--- + +name: incomplete_lbracket_junk +label: incomplete tags: just a left bracket, surrounded by stuff +input: "foo" +output: [Text(text="junk ")] + +--- + +name: incomplete_open_unnamed_attr +label: incomplete tags: an open tag, unnamed attribute +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_attr_equals +label: incomplete tags: an open tag, attribute, equal sign +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_attr +label: incomplete tags: an open tag, attribute with a key/value +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_attr_quoted +label: incomplete tags: an open tag, attribute with a key/value, quoted +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_text +label: incomplete tags: an open tag, text +input: "junk foo" +output: [Text(text="junk foo")] + +--- + +name: incomplete_open_attr_text +label: incomplete tags: an open tag, attribute with a key/value, text +input: "junk bar" +output: [Text(text="junk bar")] + +--- + +name: incomplete_open_text_lbracket +label: incomplete tags: an open tag, text, left open bracket +input: "junk bar<" +output:
[Text(text="junk bar<")] + +--- + +name: incomplete_open_text_lbracket_slash +label: incomplete tags: an open tag, text, left bracket, slash +input: "junk barbarbarbar" +output: [Text(text="junk bar")] From 6450814729c4725760386ae9e8a24a30c46b7033 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 28 Jun 2013 23:34:24 -0400 Subject: [PATCH 23/77] Remove 'type' attribute from tags; rework tag definitions. --- mwparserfromhell/nodes/tag.py | 30 ++------- mwparserfromhell/parser/builder.py | 8 +-- mwparserfromhell/parser/tokenizer.py | 21 ++---- mwparserfromhell/tag_defs.py | 123 ++++++++++------------------------- mwparserfromhell/utils.py | 2 + tests/test_builder.py | 9 ++- tests/tokenizer/tags.mwtest | 28 ++++---- 7 files changed, 72 insertions(+), 149 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index d301d85..cd5d0a2 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -24,18 +24,17 @@ from __future__ import unicode_literals from . import Node, Text from ..compat import str -from ..tag_defs import TagDefinitions +from ..tag_defs import get_wikicode, is_visible from ..utils import parse_anything __all__ = ["Tag"] -class Tag(TagDefinitions, Node): +class Tag(Node): """Represents an HTML-style tag in wikicode, like ````.""" - def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, + def __init__(self, tag, contents=None, attrs=None, showtag=True, self_closing=False, padding="", closing_tag=None): super(Tag, self).__init__() - self._type = type_ self._tag = tag self._contents = contents if attrs: @@ -52,7 +51,7 @@ class Tag(TagDefinitions, Node): def __unicode__(self): if not self.showtag: - open_, close = self.WIKICODE[self.type] + open_, close = get_wikicode[self.tag] if self.self_closing: return open_ else: @@ -84,7 +83,7 @@ class Tag(TagDefinitions, Node): yield self.contents, child def __strip__(self, normalize, collapse): - if self.type in self.TAGS_VISIBLE: + if is_visible(self.tag): return self.contents.strip_code(normalize, collapse) return None @@ -113,11 +112,6 @@ class Tag(TagDefinitions, Node): write(">") @property - def type(self): - """The tag type.""" - return self._type - - @property def tag(self): """The tag itself, as a :py:class:`~.Wikicode` object.""" return self._tag @@ -159,23 +153,9 @@ class Tag(TagDefinitions, Node): """ return self._closing_tag - @type.setter - def type(self, value): - value = int(value) - if value not in self.TAGS_ALL: - raise ValueError(value) - self._type = value - for key in self.TRANSLATIONS: - if self.TRANSLATIONS[key] == value: - self._tag = self._closing_tag = parse_anything(key) - @tag.setter def tag(self, value): self._tag = self._closing_tag = parse_anything(value) - try: - self._type = self.TRANSLATIONS[text] - except KeyError: - self._type = self.TAG_UNKNOWN @contents.setter def contents(self, value): diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 5ec0780..53abe91 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -202,7 +202,7 @@ class Builder(object): def _handle_tag(self, token): """Handle a case where a tag is at the head of the tokens.""" - type_, showtag = token.type, token.showtag + showtag = token.showtag attrs = [] self._push() while self._tokens: @@ -215,14 +215,14 @@ class Builder(object): self._push() elif isinstance(token, tokens.TagCloseSelfclose): tag = self._pop() - return Tag(type_, tag, attrs=attrs, showtag=showtag, + return Tag(tag, 
attrs=attrs, showtag=showtag, self_closing=True, padding=token.padding) elif isinstance(token, tokens.TagOpenClose): contents = self._pop() self._push() elif isinstance(token, tokens.TagCloseClose): - return Tag(type_, tag, contents, attrs, showtag, False, - padding, self._pop()) + return Tag(tag, contents, attrs, showtag, False, padding, + self._pop()) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 67a652a..e7fdb0e 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -27,7 +27,7 @@ import re from . import contexts from . import tokens from ..compat import htmlentities -from ..nodes.tag import Tag +from ..tag_defs import is_parsable __all__ = ["Tokenizer"] @@ -416,8 +416,8 @@ class Tokenizer(object): else: self._write_all(tokens) - def _get_tag_type_from_stack(self, stack=None): - """Return the tag type based on the text in *stack*. + def _get_tag_from_stack(self, stack=None): + """Return the tag based on the text in *stack*. If *stack* is ``None``, we will use the current, topmost one. """ @@ -427,11 +427,7 @@ class Tokenizer(object): if not stack: self._fail_route() # Tag has an empty name? text = [tok for tok in stack if isinstance(tok, tokens.Text)] - text = "".join([token.text for token in text]).rstrip().lower() - try: - return Tag.TRANSLATIONS[text] - except KeyError: - return Tag.TAG_UNKNOWN + return "".join([token.text for token in text]).rstrip().lower() def _actually_close_tag_opening(self): """Handle cleanup at the end of a opening tag. @@ -447,8 +443,7 @@ class Tokenizer(object): if self._context & contexts.TAG_OPEN_ATTR_BODY: self._context ^= contexts.TAG_OPEN_ATTR_BODY else: - tag = self._get_tag_type_from_stack() - self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._write_first(tokens.TagOpenOpen(showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY @@ -509,8 +504,7 @@ class Tokenizer(object): is_quoted = False if self._context & contexts.TAG_OPEN_NAME: self._write_text(chunks.pop(0)) - tag = self._get_tag_type_from_stack() - self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._write_first(tokens.TagOpenOpen(showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME self._actually_handle_chunk(chunks, True) @@ -584,8 +578,7 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" closing = self._pop() - tag = self._get_tag_type_from_stack(closing) - if tag != self._stack[0].type: + if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): # Closing and opening tags are not the same, so fail this route: self._fail_route() self._write_all(closing) diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py index b2ee90d..369692b 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/tag_defs.py @@ -20,99 +20,48 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +"""Contains data regarding certain HTML tags.""" -class TagDefinitions(object): - """Contains numerical definitions for valid HTML (and wikicode) tags. +from __future__ import unicode_literals - Base class for :py:class:`~.Tag` objects. 
- """ +__all__ = ["get_wikicode", "is_parsable", "is_visible"] - TAG_UNKNOWN = 0 +PARSER_BLACKLIST = [ + # enwiki extensions @ 2013-06-28 + "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math", + "nowiki", "pre", "score", "section", "source", "syntaxhighlight", + "templatedata", "timeline" +] - # Basic HTML: - TAG_ITALIC = 1 - TAG_BOLD = 2 - TAG_UNDERLINE = 3 - TAG_STRIKETHROUGH = 4 - TAG_UNORDERED_LIST = 5 - TAG_ORDERED_LIST = 6 - TAG_DEF_TERM = 7 - TAG_DEF_ITEM = 8 - TAG_BLOCKQUOTE = 9 - TAG_RULE = 10 - TAG_BREAK = 11 - TAG_ABBR = 12 - TAG_PRE = 13 - TAG_MONOSPACE = 14 - TAG_CODE = 15 - TAG_SPAN = 16 - TAG_DIV = 17 - TAG_FONT = 18 - TAG_SMALL = 19 - TAG_BIG = 20 - TAG_CENTER = 21 +INVISIBLE_TAGS = [ + # enwiki extensions @ 2013-06-28 + "categorytree", "gallery", "imagemap", "inputbox", "math", "score", + "section", "templatedata", "timeline" +] - # MediaWiki parser hooks: - TAG_REF = 101 - TAG_GALLERY = 102 - TAG_MATH = 103 - TAG_NOWIKI = 104 - TAG_NOINCLUDE = 105 - TAG_INCLUDEONLY = 106 - TAG_ONLYINCLUDE = 107 +# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 +SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] +SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] - # Additional parser hooks: - TAG_SYNTAXHIGHLIGHT = 201 - TAG_POEM = 202 +WIKICODE = { + "i": {"open": "''", "close": "''"}, + "b": {"open": "'''", "close": "'''"}, + "ul": {"open": "*"}, + "ol": {"open": "#"}, + "dt": {"open": ";"}, + "dd": {"open": ":"}, + "hr": {"open": "----"}, +} - # Lists of tags: - TAGS_ALL = set(range(300)) - TAGS_INVISIBLE = {TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE} - TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE +def get_wikicode(tag): + """Return the appropriate wikicode before and after the given *tag*.""" + data = WIKICODE[tag.lower()] + return (data.get("open"), data.get("close")) - TRANSLATIONS = { - "i": TAG_ITALIC, - "em": TAG_ITALIC, - "b": TAG_BOLD, - "strong": TAG_BOLD, - "u": TAG_UNDERLINE, - "s": TAG_STRIKETHROUGH, - "ul": TAG_UNORDERED_LIST, - "ol": TAG_ORDERED_LIST, - "dt": TAG_DEF_TERM, - "dd": TAG_DEF_ITEM, - "blockquote": TAG_BLOCKQUOTE, - "hl": TAG_RULE, - "br": TAG_BREAK, - "abbr": TAG_ABBR, - "pre": TAG_PRE, - "tt": TAG_MONOSPACE, - "code": TAG_CODE, - "span": TAG_SPAN, - "div": TAG_DIV, - "font": TAG_FONT, - "small": TAG_SMALL, - "big": TAG_BIG, - "center": TAG_CENTER, - "ref": TAG_REF, - "gallery": TAG_GALLERY, - "math": TAG_MATH, - "nowiki": TAG_NOWIKI, - "noinclude": TAG_NOINCLUDE, - "includeonly": TAG_INCLUDEONLY, - "onlyinclude": TAG_ONLYINCLUDE, - "syntaxhighlight": TAG_SYNTAXHIGHLIGHT, - "source": TAG_SYNTAXHIGHLIGHT, - "poem": TAG_POEM, - } +def is_parsable(tag): + """Return if the given *tag*'s contents should be passed to the parser.""" + return tag.lower() not in PARSER_BLACKLIST - WIKICODE = { - TAG_ITALIC: ("''", "''"), - TAG_BOLD: ("'''", "'''"), - TAG_UNORDERED_LIST: ("*", ""), - TAG_ORDERED_LIST: ("#", ""), - TAG_DEF_TERM: (";", ""), - TAG_DEF_ITEM: (":", ""), - TAG_RULE: ("----", ""), - } +def is_visible(tag): + """Return whether or not the given *tag* contains visible text.""" + return tag.lower() not in INVISIBLE_TAGS diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index b797419..31e5ba0 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -31,6 +31,8 @@ from .compat import bytes, str from .nodes import Node from .smart_list import SmartList +__all__ = ["parse_anything"] + def parse_anything(value): """Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. 
diff --git a/tests/test_builder.py b/tests/test_builder.py index 85a8c60..0c635ce 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -193,11 +193,10 @@ class TestBuilder(TreeEqualityTestCase): def test_tag(self): """tests for building Tag nodes""" tests = [ - ([tokens.TagOpenOpen(showtag=True, type=101), - tokens.Text(text="ref"), tokens.TagCloseOpen(padding=""), - tokens.TagOpenClose(), tokens.Text(text="ref"), - tokens.TagCloseClose()], - wrap([Tag(101, wraptext("ref"), wrap([]), [], True, False, "", + ([tokens.TagOpenOpen(showtag=True), tokens.Text(text="ref"), + tokens.TagCloseOpen(padding=""), tokens.TagOpenClose(), + tokens.Text(text="ref"), tokens.TagCloseClose()], + wrap([Tag(wraptext("ref"), wrap([]), [], True, False, "", wraptext("ref"))])), ] for test, valid in tests: diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 5af2074..a76d6b6 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -1,98 +1,98 @@ name: basic label: a basic tag with an open and close input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: basic_selfclosing label: a basic self-closing tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding="")] --- name: content label: a tag with some content in the middle input: "this is a reference" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: padded_open label: a tag with some padding in the open tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: padded_close label: a tag with some padding in the close tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] --- name: padded_selfclosing label: a self-closing tag with padding input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding=" ")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding=" ")] --- name: attribute label: a tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_value label: a tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True, type=101), 
Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted label: a tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_hyphen label: a tag with a single attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted_hyphen label: a tag with a single quoted attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_selfclosing label: a self-closing tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value label: a self-closing tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value_quoted label: a self-closing tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] --- From ce27d5d385a4adc14e136b33471216038dfc70a1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 29 Jun 2013 00:33:41 -0400 Subject: [PATCH 24/77] Fix six failing tests; add three more (all 
passing). --- mwparserfromhell/parser/tokenizer.py | 33 ++++++++++++++++++--------------- tests/tokenizer/tags.mwtest | 21 +++++++++++++++++++++ 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e7fdb0e..93e9a8d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -21,6 +21,7 @@ # SOFTWARE. from __future__ import unicode_literals +from itertools import takewhile from math import log import re @@ -416,19 +417,6 @@ class Tokenizer(object): else: self._write_all(tokens) - def _get_tag_from_stack(self, stack=None): - """Return the tag based on the text in *stack*. - - If *stack* is ``None``, we will use the current, topmost one. - """ - if stack is None: - stack = self._stack - self._push_textbuffer() - if not stack: - self._fail_route() # Tag has an empty name? - text = [tok for tok in stack if isinstance(tok, tokens.Text)] - return "".join([token.text for token in text]).rstrip().lower() - def _actually_close_tag_opening(self): """Handle cleanup at the end of an opening tag. @@ -557,14 +545,27 @@ class Tokenizer(object): while chunks: self._actually_handle_chunk(chunks, True) + def _get_tag_from_stack(self, stack=None): + """Return the tag based on the text in *stack*.""" + if not stack: + sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen) + func = lambda tok: not isinstance(tok, sentinels) + stack = takewhile(func, self._stack) + text = [tok.text for tok in stack if isinstance(tok, tokens.Text)] + return "".join(text).rstrip().lower() + def _handle_tag_close_open(self): """Handle the ending of an open tag (````).""" padding = self._actually_close_tag_opening() + if not self._get_tag_from_stack(): # Tags cannot be blank + self._fail_route() self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): """Handle the ending of a tag that closes itself (````).""" padding = self._actually_close_tag_opening() + if not self._get_tag_from_stack(): # Tags cannot be blank + self._fail_route() self._write(tokens.TagCloseSelfclose(padding=padding)) self._head += 1 return self._pop() @@ -578,8 +579,10 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" closing = self._pop() - if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): - # Closing and opening tags are not the same, so fail this route: + close_tag = self._get_tag_from_stack(closing) + open_tag = self._get_tag_from_stack() + if not close_tag or close_tag != open_tag: + # Closing and opening tags are empty or unequal, so fail this tag: self._fail_route() self._write_all(closing) self._write(tokens.TagCloseClose()) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index a76d6b6..849a4fd 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -233,3 +233,24 @@ name: incomplete_open_text_wrong_close label: incomplete tags: an open tag, text, wrong close input: "junk bar" output: [Text(text="junk bar")] + +--- + +name: incomplete_no_tag_name_open +label: incomplete tags: no tag name within brackets; just an open +input: "junk <>" +output: [Text(text="junk <>")] + +--- + +name: incomplete_no_tag_name_selfclosing +label: incomplete tags: no tag name within brackets; self-closing +input: "junk < />" +output: [Text(text="junk < />")] + +--- + +name: incomplete_no_tag_name_open_close +label: incomplete tags: no tag name within brackets; open and close +input: "junk <>" +output:
[Text(text="junk <>")] From c241bff9f50896d83294ed12c72b8d59dc932b2b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 29 Jun 2013 00:37:29 -0400 Subject: [PATCH 25/77] Remove .type check from assertTagNodeEqual() --- tests/_test_tree_equality.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/_test_tree_equality.py b/tests/_test_tree_equality.py index 2828147..6976a13 100644 --- a/tests/_test_tree_equality.py +++ b/tests/_test_tree_equality.py @@ -91,7 +91,6 @@ class TreeEqualityTestCase(TestCase): def assertTagNodeEqual(self, expected, actual): """Assert that two Tag nodes have the same data.""" - self.assertEqual(expected.type, actual.type) self.assertWikicodeEqual(expected.tag, actual.tag) if expected.contents is not None: self.assertWikicodeEqual(expected.contents, actual.contents) From 81e8fdd6829c12468f0f12c71d707c452eb9e2bb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 20:57:54 -0400 Subject: [PATCH 26/77] Give Attributes more attributes for padding data. --- mwparserfromhell/nodes/extras/attribute.py | 41 ++++++++++++++++++++++-------- mwparserfromhell/nodes/tag.py | 2 +- mwparserfromhell/parser/builder.py | 13 ++++++---- tests/tokenizer/tags.mwtest | 16 ++++++------ 4 files changed, 48 insertions(+), 24 deletions(-) diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index 33ad851..5888dba 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -36,19 +36,22 @@ class Attribute(StringMixIn): whose value is ``"foo"``. """ - def __init__(self, name, value=None, quoted=True, padding=""): + def __init__(self, name, value=None, quoted=True, pad_first="", + pad_before_eq="", pad_after_eq=""): super(Attribute, self).__init__() self._name = name self._value = value self._quoted = quoted - self._padding = padding + self._pad_first = pad_first + self._pad_before_eq = pad_before_eq + self._pad_after_eq = pad_after_eq def __unicode__(self): - base = self.padding + str(self.name) + base = self.pad_first + str(self.name) + self.pad_before_eq if self.value: if self.quoted: - return base + '="' + str(self.value) + '"' - return base + "=" + str(self.value) + return base + '="' + self.pad_after_eq + str(self.value) + '"' + return base + "=" + self.pad_after_eq + str(self.value) return base @property @@ -67,9 +70,19 @@ class Attribute(StringMixIn): return self._quoted @property - def padding(self): + def pad_first(self): """Spacing to insert right before the attribute.""" - return self._padding + return self._pad_first + + @property + def pad_before_eq(self): + """Spacing to insert right before the equal sign.""" + return self._pad_before_eq + + @property + def pad_after_eq(self): + """Spacing to insert right after the equal sign.""" + return self._pad_after_eq @name.setter def name(self, value): @@ -83,6 +96,14 @@ class Attribute(StringMixIn): def quoted(self, value): self._quoted = bool(value) - @padding.setter - def padding(self, value): - self._padding = str(value) + @pad_first.setter + def pad_first(self, value): + self._pad_first = str(value) + + @pad_before_eq.setter + def pad_before_eq(self, value): + self._pad_before_eq = str(value) + + @pad_after_eq.setter + def pad_after_eq(self, value): + self._pad_after_eq = str(value) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index cd5d0a2..76b412c 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -59,7 +59,7 @@ class Tag(Node): result = "<" + str(self.tag) 
if self.attributes: - result += " " + " ".join([str(attr) for attr in self.attributes]) + result += "".join([str(attr) for attr in self.attributes]) if self.self_closing: result += self.padding + "/>" else: diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 53abe91..d92b845 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -180,9 +180,9 @@ class Builder(object): else: self._write(self._handle_token(token)) - def _handle_attribute(self, token): + def _handle_attribute(self, start): """Handle a case where a tag attribute is at the head of the tokens.""" - name, quoted, padding = None, False, token.padding + name, quoted = None, False self._push() while self._tokens: token = self._tokens.pop() @@ -194,9 +194,12 @@ class Builder(object): elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen, tokens.TagCloseSelfclose)): self._tokens.append(token) - if name is not None: - return Attribute(name, self._pop(), quoted, padding) - return Attribute(self._pop(), quoted=quoted, padding=padding) + if name: + value = self._pop() + else: + name, value = self._pop(), None + return Attribute(name, value, quoted, start.pad_first, + start.pad_before_eq, start.pad_after_eq) else: self._write(self._handle_token(token)) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 849a4fd..1dfc1b1 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -43,56 +43,56 @@ output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding= name: attribute label: a tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_value label: a tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted label: a tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_hyphen label: a tag with a single attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", 
pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted_hyphen label: a tag with a single quoted attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_selfclosing label: a self-closing tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value label: a self-closing tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value_quoted label: a self-closing tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] --- From 5f5a081d9148c584511bffb3d6d3b8f63ea24d43 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 21:02:11 -0400 Subject: [PATCH 27/77] Rewrite tag parser to be cleaner and safer. All tag tests passing. Still need to finish backslash support and support for templates and tags within tags. 
--- mwparserfromhell/parser/contexts.py | 87 ++++----- mwparserfromhell/parser/tokenizer.py | 339 ++++++++++++++++------------------- 2 files changed, 194 insertions(+), 232 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 3c9c798..9e5e568 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -65,15 +65,7 @@ Local (stack-specific) contexts: * :py:const:`TAG` * :py:const:`TAG_OPEN` - - * :py:const:`TAG_OPEN_NAME` - * :py:const:`TAG_OPEN_ATTR` - - * :py:const:`TAG_OPEN_ATTR_NAME` - * :py:const:`TAG_OPEN_ATTR_BODY` - * :py:const:`TAG_OPEN_ATTR_QUOTED` - * :py:const:`TAG_OPEN_ATTR_IGNORE` - + * :py:const:`TAG_ATTR` * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` @@ -93,47 +85,42 @@ Global contexts: # Local contexts: -TEMPLATE = 0b000000000000000000000000111 -TEMPLATE_NAME = 0b000000000000000000000000001 -TEMPLATE_PARAM_KEY = 0b000000000000000000000000010 -TEMPLATE_PARAM_VALUE = 0b000000000000000000000000100 - -ARGUMENT = 0b000000000000000000000011000 -ARGUMENT_NAME = 0b000000000000000000000001000 -ARGUMENT_DEFAULT = 0b000000000000000000000010000 - -WIKILINK = 0b000000000000000000001100000 -WIKILINK_TITLE = 0b000000000000000000000100000 -WIKILINK_TEXT = 0b000000000000000000001000000 - -HEADING = 0b000000000000001111110000000 -HEADING_LEVEL_1 = 0b000000000000000000010000000 -HEADING_LEVEL_2 = 0b000000000000000000100000000 -HEADING_LEVEL_3 = 0b000000000000000001000000000 -HEADING_LEVEL_4 = 0b000000000000000010000000000 -HEADING_LEVEL_5 = 0b000000000000000100000000000 -HEADING_LEVEL_6 = 0b000000000000001000000000000 - -COMMENT = 0b000000000000010000000000000 - -TAG = 0b000000111111100000000000000 -TAG_OPEN = 0b000000001111100000000000000 -TAG_OPEN_NAME = 0b000000000000100000000000000 -TAG_OPEN_ATTR = 0b000000001111000000000000000 -TAG_OPEN_ATTR_NAME = 0b000000000001000000000000000 -TAG_OPEN_ATTR_BODY = 0b000000000010000000000000000 -TAG_OPEN_ATTR_QUOTED = 0b000000000100000000000000000 -TAG_OPEN_ATTR_IGNORE = 0b000000001000000000000000000 -TAG_BODY = 0b000000010000000000000000000 -TAG_CLOSE = 0b000000100000000000000000000 - -SAFETY_CHECK = 0b111111000000000000000000000 -HAS_TEXT = 0b000001000000000000000000000 -FAIL_ON_TEXT = 0b000010000000000000000000000 -FAIL_NEXT = 0b000100000000000000000000000 -FAIL_ON_LBRACE = 0b001000000000000000000000000 -FAIL_ON_RBRACE = 0b010000000000000000000000000 -FAIL_ON_EQUALS = 0b100000000000000000000000000 +TEMPLATE = 0b000000000000000000000111 +TEMPLATE_NAME = 0b000000000000000000000001 +TEMPLATE_PARAM_KEY = 0b000000000000000000000010 +TEMPLATE_PARAM_VALUE = 0b000000000000000000000100 + +ARGUMENT = 0b000000000000000000011000 +ARGUMENT_NAME = 0b000000000000000000001000 +ARGUMENT_DEFAULT = 0b000000000000000000010000 + +WIKILINK = 0b000000000000000001100000 +WIKILINK_TITLE = 0b000000000000000000100000 +WIKILINK_TEXT = 0b000000000000000001000000 + +HEADING = 0b000000000001111110000000 +HEADING_LEVEL_1 = 0b000000000000000010000000 +HEADING_LEVEL_2 = 0b000000000000000100000000 +HEADING_LEVEL_3 = 0b000000000000001000000000 +HEADING_LEVEL_4 = 0b000000000000010000000000 +HEADING_LEVEL_5 = 0b000000000000100000000000 +HEADING_LEVEL_6 = 0b000000000001000000000000 + +COMMENT = 0b000000000010000000000000 + +TAG = 0b000000111100000000000000 +TAG_OPEN = 0b000000000100000000000000 +TAG_ATTR = 0b000000001000000000000000 +TAG_BODY = 0b000000010000000000000000 +TAG_CLOSE = 0b000000100000000000000000 + +SAFETY_CHECK = 0b111111000000000000000000 +HAS_TEXT = 0b000001000000000000000000 
+FAIL_ON_TEXT = 0b000010000000000000000000 +FAIL_NEXT = 0b000100000000000000000000 +FAIL_ON_LBRACE = 0b001000000000000000000000 +FAIL_ON_RBRACE = 0b010000000000000000000000 +FAIL_ON_EQUALS = 0b100000000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 93e9a8d..a7b9e16 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -37,6 +37,26 @@ class BadRoute(Exception): pass +class _TagOpenData(object): + """Stores data about an HTML open tag, like ````.""" + CX_NAME = 1 << 0 + CX_ATTR_READY = 1 << 1 + CX_ATTR_NAME = 1 << 2 + CX_ATTR_VALUE = 1 << 3 + CX_NEED_SPACE = 1 << 4 + CX_NEED_EQUALS = 1 << 5 + CX_NEED_QUOTE = 1 << 6 + CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE + + def __init__(self): + self.context = self.CX_NAME + self.literal = True + self.padding_buffer = [] + self.quote_buffer = [] + self.reset = 0 + self.ignore_quote = False + + class Tokenizer(object): """Creates a list of tokens from a string of wikicode.""" USES_C = False @@ -47,6 +67,7 @@ class Tokenizer(object): MAX_DEPTH = 40 MAX_CYCLES = 100000 regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE) + tag_splitter = re.compile(r"([\s\"\\])") def __init__(self): self._text = None @@ -410,165 +431,145 @@ class Tokenizer(object): reset = self._head self._head += 1 try: - tokens = self._parse(contexts.TAG_OPEN_NAME) + tokens = self._really_parse_tag() except BadRoute: self._head = reset self._write_text("<") else: self._write_all(tokens) - def _actually_close_tag_opening(self): - """Handle cleanup at the end of a opening tag. - - The current context will be updated and the - :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the - opening tag's padding to be used in the - :py:class:`~.tokens.TagOpenClose` token. - """ - if self._context & contexts.TAG_OPEN_ATTR: - if self._context & contexts.TAG_OPEN_ATTR_NAME: - self._context ^= contexts.TAG_OPEN_ATTR_NAME - if self._context & contexts.TAG_OPEN_ATTR_BODY: - self._context ^= contexts.TAG_OPEN_ATTR_BODY - else: - self._write_first(tokens.TagOpenOpen(showtag=True)) - self._context ^= contexts.TAG_OPEN_NAME - self._context |= contexts.TAG_BODY - - self._push_textbuffer() - if isinstance(self._stack[-1], tokens.TagAttrStart): - return self._stack.pop().padding - return "" - - def _actually_handle_chunk(self, chunks, is_new): - """Actually handle a chunk of code within a tag's attributes. 
+ def _really_parse_tag(self): + """Actually parse an HTML tag, starting with the open (````).""" + data = _TagOpenData() + self._push(contexts.TAG_OPEN) + self._write(tokens.TagOpenOpen(showtag=True)) + while True: + this, next = self._read(), self._read(1) + if this not in self.MARKERS: + for chunk in self.tag_splitter.split(this): + if self._handle_tag_chunk(data, chunk): + continue + elif this is self.END: + if self._context & contexts.TAG_ATTR: + self._pop() + self._fail_route() + elif this == ">" and data.literal: + if data.context & data.CX_ATTR: + self._push_tag_buffer(data) + padding = data.padding_buffer[0] if data.padding_buffer else "" + self._write(tokens.TagCloseOpen(padding=padding)) + self._context = contexts.TAG_BODY + self._head += 1 + return self._parse(push=False) + elif this == "/" and next == ">" and data.literal: + if data.context & data.CX_ATTR: + self._push_tag_buffer(data) + padding = data.padding_buffer[0] if data.padding_buffer else "" + self._write(tokens.TagCloseSelfclose(padding=padding)) + self._head += 1 + return self._pop() + else: + for chunk in self.tag_splitter.split(this): + if self._handle_tag_chunk(data, chunk): + continue + self._head += 1 - Called by :py:meth:`_handle_tag_chunk` and - :py:meth:`_handle_tag_attribute_body`. - """ - if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: - padding = 0 - while chunks: - if chunks[0] == "": - padding += 1 - chunks.pop(0) - else: - break - self._write(tokens.TagAttrStart(padding=" " * padding)) - elif self._context & contexts.TAG_OPEN_ATTR_IGNORE: - self._context ^= contexts.TAG_OPEN_ATTR_IGNORE - chunks.pop(0) + def _handle_tag_chunk(self, data, chunk): + if not chunk: return - elif is_new and self._context & contexts.TAG_OPEN_ATTR_QUOTED: - self._write_text(" ") # Quoted chunks don't lose their spaces - - if chunks: - chunk = chunks.pop(0) - if self._context & contexts.TAG_OPEN_ATTR_BODY: - self._context ^= contexts.TAG_OPEN_ATTR_BODY - self._context |= contexts.TAG_OPEN_ATTR_NAME - if self._context & contexts.TAG_OPEN_ATTR_QUOTED: - if re.search(r'[^\\]"', chunk[:-1]): - self._fail_route() - if re.search(r'[^\\]"$', chunk): - self._write_text(chunk[:-1]) - self._context ^= contexts.TAG_OPEN_ATTR_QUOTED - self._context |= contexts.TAG_OPEN_ATTR_NAME - return True # Back to _handle_tag_attribute_body() + if data.context & data.CX_NAME: + if chunk != chunk.lstrip(): # Tags cannot start with whitespace + self._fail_route() self._write_text(chunk) - - def _handle_tag_chunk(self, text): - """Handle a chunk of code within a tag's attributes. - - This is called by :py:meth:`_parse`, which intercepts parsing of - wikicode when we're inside of an opening tag and no :py:attr:`MARKERS` - are present. - """ - if " " not in text and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: - self._write_text(text) - return - chunks = text.split(" ") - is_new = False - is_quoted = False - if self._context & contexts.TAG_OPEN_NAME: - self._write_text(chunks.pop(0)) - self._write_first(tokens.TagOpenOpen(showtag=True)) - self._context ^= contexts.TAG_OPEN_NAME - self._context |= contexts.TAG_OPEN_ATTR_NAME - self._actually_handle_chunk(chunks, True) - is_new = True - while chunks: - result = self._actually_handle_chunk(chunks, is_new) - is_quoted = result or is_quoted - is_new = True - if is_quoted: - return self._pop() - - def _handle_tag_attribute_body(self): - """Handle the body, or value, of a tag attribute. 
- - Attribute bodies can usually be handled at once, but sometimes a new - stack must be created to keep track of "rich" attribute values that - contain, for example, templates. - """ - self._context ^= contexts.TAG_OPEN_ATTR_NAME - self._context |= contexts.TAG_OPEN_ATTR_BODY - self._write(tokens.TagAttrEquals()) - next = self._read(1) - if next not in self.MARKERS and next.startswith('"'): - chunks = None - if " " in next: - chunks = next.split(" ") - next = chunks.pop(0) - if re.search(r'[^\\]"$', next[1:]): - if not re.search(r'[^\\]"', next[1:-1]): - self._write(tokens.TagAttrQuote()) - self._write_text(next[1:-1]) - self._head += 1 + data.context = data.CX_NEED_SPACE + elif data.context & data.CX_NEED_SPACE: + if chunk.isspace(): + if data.context & data.CX_ATTR_VALUE: + self._push_tag_buffer(data) + data.padding_buffer.append(chunk) + data.context = data.CX_ATTR_READY else: - if not re.search(r'[^\\]"', next[1:]): - self._head += 1 - reset = self._head - try: - attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | - contexts.TAG_OPEN_ATTR_IGNORE) - except BadRoute: - self._head = reset - self._write_text(next) - else: - self._write(tokens.TagAttrQuote()) - self._write_text(next[1:]) - self._write_all(attr) - return - self._context ^= contexts.TAG_OPEN_ATTR_BODY - self._context |= contexts.TAG_OPEN_ATTR_NAME - while chunks: - self._actually_handle_chunk(chunks, True) + if data.context & data.CX_ATTR_VALUE: + data.context ^= data.CX_NEED_SPACE + data.quote_buffer = [] + data.ignore_quote = True + self._head = data.reset + return True # Break out of chunk processing early + else: + self._fail_route() + elif data.context & data.CX_ATTR_READY: + if chunk.isspace(): + data.padding_buffer.append(chunk) + else: + data.context = data.CX_ATTR_NAME + self._push(contexts.TAG_ATTR) + self._write_text(chunk) ### hook on here for {, <, etc + elif data.context & data.CX_ATTR_NAME: + if chunk.isspace(): + data.padding_buffer.append(chunk) + data.context |= data.CX_NEED_EQUALS + elif chunk == "=": + if not data.context & data.CX_NEED_EQUALS: + data.padding_buffer.append("") # No padding before equals + data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE + self._write(tokens.TagAttrEquals()) + else: + if data.context & data.CX_NEED_EQUALS: + self._push_tag_buffer(data) + data.padding_buffer.append("") # No padding before tag + data.context = data.CX_ATTR_NAME + self._push(contexts.TAG_ATTR) + self._write_text(chunk) ### hook on here for {, <, etc + elif data.context & data.CX_ATTR_VALUE: + ### handle backslashes here + if data.context & data.CX_NEED_QUOTE: + if chunk == '"' and not data.ignore_quote: + data.context ^= data.CX_NEED_QUOTE + data.literal = False + data.reset = self._head + elif chunk.isspace(): + data.padding_buffer.append(chunk) + else: + data.context ^= data.CX_NEED_QUOTE + self._write_text(chunk) ### hook on here for {, <, etc + elif not data.literal: + if chunk == '"': + data.context |= data.CX_NEED_SPACE + data.literal = True + else: + data.quote_buffer.append(chunk) + elif chunk.isspace(): + self._push_tag_buffer(data) + data.padding_buffer.append(chunk) + data.context = data.CX_ATTR_READY + else: + self._write_text(chunk) ### hook on here for {, <, etc + + def _push_tag_buffer(self, data): + buf = data.padding_buffer + while len(buf) < 3: + buf.append("") + self._write_first(tokens.TagAttrStart( + pad_after_eq=buf.pop(), pad_before_eq=buf.pop(), + pad_first=buf.pop())) + if data.quote_buffer: + self._write(tokens.TagAttrQuote()) + self._write_text("".join(data.quote_buffer)) + 
self._write_all(self._pop()) + data.padding_buffer, data.quote_buffer = [], [] + data.ignore_quote = False def _get_tag_from_stack(self, stack=None): """Return the tag based on the text in *stack*.""" if not stack: sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen) - func = lambda tok: not isinstance(tok, sentinels) - stack = takewhile(func, self._stack) + pred = lambda tok: not isinstance(tok, sentinels) + stack = takewhile(pred, self._stack) text = [tok.text for tok in stack if isinstance(tok, tokens.Text)] - return "".join(text).rstrip().lower() - - def _handle_tag_close_open(self): - """Handle the ending of an open tag (````).""" - padding = self._actually_close_tag_opening() - if not self._get_tag_from_stack(): # Tags cannot be blank - self._fail_route() - self._write(tokens.TagCloseOpen(padding=padding)) - - def _handle_tag_selfclose(self): - """Handle the ending of an tag that closes itself (````).""" - padding = self._actually_close_tag_opening() - if not self._get_tag_from_stack(): # Tags cannot be blank + try: + return "".join(text).rstrip().lower().split()[0] + except IndexError: self._fail_route() - self._write(tokens.TagCloseSelfclose(padding=padding)) - self._head += 1 - return self._pop() def _handle_tag_open_close(self): """Handle the opening of a closing tag (````).""" @@ -579,10 +580,7 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" closing = self._pop() - close_tag = self._get_tag_from_stack(closing) - open_tag = self._get_tag_from_stack() - if not close_tag or close_tag != open_tag: - # Closing and opening tags are empty or unequal, so fail this tag: + if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): self._fail_route() self._write_all(closing) self._write(tokens.TagCloseClose()) @@ -645,37 +643,30 @@ class Tokenizer(object): self._context |= contexts.FAIL_ON_RBRACE return True - def _parse(self, context=0): + def _parse(self, context=0, push=True): """Parse the wikicode string, using *context* for when to stop.""" - self._push(context) + unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | + contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME | + contexts.TAG_CLOSE) + fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | + contexts.HEADING | contexts.COMMENT | contexts.TAG) + double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) + + if push: + self._push(context) while True: this = self._read() - unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | - contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME | - contexts.TAG_CLOSE) if self._context & unsafe: if not self._verify_safe(this): - double = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) - if self._context & double: + if self._context & double_fail: self._pop() self._fail_route() if this not in self.MARKERS: - if self._context & contexts.TAG_OPEN: - should_exit = self._handle_tag_chunk(this) - if should_exit: - return should_exit - else: - self._write_text(this) + self._write_text(this) self._head += 1 continue if this is self.END: - fail = ( - contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | - contexts.HEADING | contexts.COMMENT | contexts.TAG) if self._context & fail: - double_fail = ( - contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | - contexts.TAG_OPEN_ATTR_QUOTED) if self._context & double_fail: self._pop() self._fail_route() @@ -720,8 +711,6 @@ class Tokenizer(object): elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", 
self.START): self._parse_heading() - elif self._context & contexts.TAG_OPEN_ATTR_NAME: - self._handle_tag_attribute_body() else: self._write_text("=") elif this == "=" and self._context & contexts.HEADING: @@ -735,22 +724,8 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) - elif this == "<" and next != "/" and ( - not self._context & (contexts.TAG ^ contexts.TAG_BODY)): + elif this == "<" and next != "/" and not self._context & contexts.TAG_CLOSE: self._parse_tag() - elif self._context & contexts.TAG_OPEN: - if self._context & contexts.TAG_OPEN_ATTR_QUOTED: - self._handle_tag_chunk(this) - elif this == "\n": - self._fail_route() - elif this == ">": - self._handle_tag_close_open() - elif this == "/" and next == ">": - return self._handle_tag_selfclose() - elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: - self._handle_tag_attribute_body() - else: - self._handle_tag_chunk(this) elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: From 962adcd62c48a426750fd637cfa27a2d74943474 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 22:27:44 -0400 Subject: [PATCH 28/77] Add docstrings for a couple new methods in the tokenizer. --- mwparserfromhell/parser/tokenizer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index a7b9e16..9817bd9 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -475,6 +475,11 @@ class Tokenizer(object): self._head += 1 def _handle_tag_chunk(self, data, chunk): + """Handle a *chunk* of text inside a HTML open tag. + + A "chunk" is either a marker, whitespace, or text containing no markers + or whitespace. *data* is a :py:class:`_TagOpenData` object. + """ if not chunk: return if data.context & data.CX_NAME: @@ -546,6 +551,10 @@ class Tokenizer(object): self._write_text(chunk) ### hook on here for {, <, etc def _push_tag_buffer(self, data): + """Write a pending tag attribute from *data* to the stack. + + *data* is a :py:class:`_TagOpenData` object. + """ buf = data.padding_buffer while len(buf) < 3: buf.append("") From 43e717cca927009c840ddabb3ebabad834d14adf Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 22:41:19 -0400 Subject: [PATCH 29/77] Add a number of new tag tests. 
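The "chunk" vocabulary that patch 28 documents comes from tag_splitter, whose pattern wraps its character class in a capturing group. With a capturing group, re.split() returns the delimiters themselves (whitespace, quotes, backslashes) interleaved with the surrounding text instead of discarding them. A quick illustration; the pattern is the one from the diff, while the sample string is arbitrary:

    import re

    # The capturing parentheses make split() keep each matched delimiter
    # as its own list element.
    tag_splitter = re.compile(r"([\s\"\\])")
    print(tag_splitter.split('name="foo bar"'))
    # ['name=', '"', 'foo', ' ', 'bar', '"', '']

Adjacent delimiters (and delimiters at the ends of the string) also yield empty strings in the result, which is why _handle_tag_chunk begins by returning early on an empty chunk.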
--- tests/tokenizer/tags.mwtest | 70 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 1dfc1b1..7d5f338 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -96,6 +96,76 @@ output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" " --- +name: invalid_space_begin_open +label: invalid tag: a space at the beginning of the open tag +input: "< ref>test" +output: [Text(text="< ref>test")] + +--- + +name: invalid_space_begin_close +label: invalid tag: a space at the beginning of the close tag +input: "test" +output: [Text(text="test")] + +--- + +name: valid_space_end +label: valid tag: spaces at the ends of both the open and close tags +input: "test" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=" "), Text(text="test"), TagOpenClose(), Text(text="ref "), TagCloseClose()] + +--- + +name: invalid_template_ends +label: invalid tag: a template at the ends of both the open and close tags +input: "test" +output: [Text(text="test" +output: [Text(text="test" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding=""), Text(text="test"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: valid_template_end_open_space_end_close +label: valid tag: a template at the end of the open tag; whitespace at the end of the close tag +input: "test" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding=""), Text(text="test"), TagOpenClose(), Text(text="ref\n"), TagCloseClose()] + +--- + +name: invalid_template_end_open_nospace +label: invalid tag: a template at the end of the open tag, without spacing +input: "test" +output: [Text(text="test" +output: [Text(text="test")] + +--- + +name: invalid_template_start_open +label: invalid tag: a template at the beginning of the open tag +input: "<{{foo}}ref>test" +output: [Text(text="<"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text="ref>test")] + +--- + name: incomplete_lbracket label: incomplete tags: just a left bracket input: "<" From 82edc93bbbd1786015a8c61521fd4f698b19724a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 22:42:26 -0400 Subject: [PATCH 30/77] Pass some tests by simplifying the way tags are read from the stack. Two still fail because templates aren't implemented yet, but those are otherwise handled correctly. 
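The simplification works because, under the rewritten tokenizer, the opening tag's name always sits at index 1 of the stack, directly after the TagOpenOpen token, so a closing tag can be validated with a direct text comparison instead of the removed _get_tag_from_stack() walk. Both sides are normalized with rstrip().lower(), so only trailing whitespace and letter case are forgiven. Pulled out of the tokenizer for illustration:

    def names_match(open_name, close_name):
        """Compare tag names the way the new _handle_tag_close_close does."""
        normalize = lambda text: text.rstrip().lower()
        return normalize(open_name) == normalize(close_name)

    assert names_match("ref", "ref ")      # trailing space is tolerated
    assert names_match("REF", "ref")       # matching is case-insensitive
    assert not names_match("ref", "span")  # mismatched names fail the route

The rstrip-only normalization is what lets the valid_space_end test pass while the invalid_space_begin_* cases are still rejected elsewhere, at the point where the tag name is first read.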
--- mwparserfromhell/parser/tokenizer.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9817bd9..8c91e4f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -568,18 +568,6 @@ class Tokenizer(object): data.padding_buffer, data.quote_buffer = [], [] data.ignore_quote = False - def _get_tag_from_stack(self, stack=None): - """Return the tag based on the text in *stack*.""" - if not stack: - sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen) - pred = lambda tok: not isinstance(tok, sentinels) - stack = takewhile(pred, self._stack) - text = [tok.text for tok in stack if isinstance(tok, tokens.Text)] - try: - return "".join(text).rstrip().lower().split()[0] - except IndexError: - self._fail_route() - def _handle_tag_open_close(self): """Handle the opening of a closing tag (````).""" self._write(tokens.TagOpenClose()) @@ -588,8 +576,10 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" strip = lambda tok: tok.text.rstrip().lower() closing = self._pop() - if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): + if len(closing) != 1 or (not isinstance(closing[0], tokens.Text) or + strip(closing[0]) != strip(self._stack[1])): self._fail_route() self._write_all(closing) self._write(tokens.TagCloseClose()) @@ -625,7 +615,7 @@ class Tokenizer(object): self._context |= contexts.HAS_TEXT return True elif context & contexts.TAG_CLOSE: - return this != "<" and this != "\n" + return this != "<" else: if context & contexts.FAIL_ON_EQUALS: if this == "=": From f63480bcf3a21b8eb61c944f30b79d04a04efe40 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 23:48:58 -0400 Subject: [PATCH 31/77] Update the integration.rich_tags test to use the new tag tokens. Remove a now-unused import in the tokenizer. --- mwparserfromhell/parser/tokenizer.py | 1 - tests/tokenizer/integration.mwtest | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 8c91e4f..9207440 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -21,7 +21,6 @@ # SOFTWARE. 

from __future__ import unicode_literals -from itertools import takewhile from math import log import re diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index ba01c8c..736ecb1 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -36,7 +36,7 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t name: rich_tags label: a HTML tag with tons of other things in it input: "{{dubious claim}}[[Source]]" -output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(padding=" "), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(padding=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(padding=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(padding=""), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagOpenClose(), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- From dfe100ceb7eecec82d6a3af98d016dfd95d3f9ea Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 1 Jul 2013 20:44:56 -0400 Subject: [PATCH 32/77] Support templates and wikilinks inside tags (part 1) --- mwparserfromhell/parser/tokenizer.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9207440..21d0f2a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -221,6 +221,8 @@ class Tokenizer(object): self._head += 1 self._write_all(self._pop()) + if self._context & contexts.FAIL_NEXT: + self._context ^= contexts.FAIL_NEXT def 
_parse_template(self): """Parse a template at the head of the wikicode string.""" @@ -293,6 +295,8 @@ class Tokenizer(object): self._head = reset self._write_text("[[") else: + if self._context & contexts.FAIL_NEXT: + self._context ^= contexts.FAIL_NEXT self._write(tokens.WikilinkOpen()) self._write_all(wikilink) self._write(tokens.WikilinkClose()) @@ -507,7 +511,7 @@ class Tokenizer(object): else: data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._write_text(chunk) ### hook on here for {, <, etc + self._parse_tag_chunk(chunk) elif data.context & data.CX_ATTR_NAME: if chunk.isspace(): data.padding_buffer.append(chunk) @@ -523,7 +527,7 @@ class Tokenizer(object): data.padding_buffer.append("") # No padding before tag data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._write_text(chunk) ### hook on here for {, <, etc + self._parse_tag_chunk(chunk) elif data.context & data.CX_ATTR_VALUE: ### handle backslashes here if data.context & data.CX_NEED_QUOTE: @@ -535,7 +539,7 @@ class Tokenizer(object): data.padding_buffer.append(chunk) else: data.context ^= data.CX_NEED_QUOTE - self._write_text(chunk) ### hook on here for {, <, etc + self._parse_tag_chunk(chunk) elif not data.literal: if chunk == '"': data.context |= data.CX_NEED_SPACE @@ -547,7 +551,18 @@ class Tokenizer(object): data.padding_buffer.append(chunk) data.context = data.CX_ATTR_READY else: - self._write_text(chunk) ### hook on here for {, <, etc + self._parse_tag_chunk(chunk) + + def _parse_tag_chunk(self, chunk): + next = self._read(1) + if not self._can_recurse() or chunk not in self.MARKERS: + self._write_text(chunk) + elif chunk == next == "{": + self._parse_template_or_argument() + elif chunk == next == "[": + self._parse_wikilink() + else: + self._write_text(chunk) def _push_tag_buffer(self, data): """Write a pending tag attribute from *data* to the stack. 
@@ -678,8 +693,6 @@ class Tokenizer(object): elif this == next == "{": if self._can_recurse(): self._parse_template_or_argument() - if self._context & contexts.FAIL_NEXT: - self._context ^= contexts.FAIL_NEXT else: self._write_text("{") elif this == "|" and self._context & contexts.TEMPLATE: @@ -698,8 +711,6 @@ class Tokenizer(object): elif this == next == "[": if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): self._parse_wikilink() - if self._context & contexts.FAIL_NEXT: - self._context ^= contexts.FAIL_NEXT else: self._write_text("[") elif this == "|" and self._context & contexts.WIKILINK_TITLE: From e34026dabe359ffd16567c8c5002d76f4981fe57 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 1 Jul 2013 22:14:57 -0400 Subject: [PATCH 33/77] Support templates and wikilinks inside tags (part 2) --- mwparserfromhell/parser/tokenizer.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 21d0f2a..29c2772 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -51,7 +51,7 @@ class _TagOpenData(object): self.context = self.CX_NAME self.literal = True self.padding_buffer = [] - self.quote_buffer = [] + self.quoted = False self.reset = 0 self.ignore_quote = False @@ -454,6 +454,8 @@ class Tokenizer(object): continue elif this is self.END: if self._context & contexts.TAG_ATTR: + if data.quoted: + self._pop() self._pop() self._fail_route() elif this == ">" and data.literal: @@ -499,8 +501,9 @@ class Tokenizer(object): else: if data.context & data.CX_ATTR_VALUE: data.context ^= data.CX_NEED_SPACE - data.quote_buffer = [] + data.quoted = False data.ignore_quote = True + self._pop() self._head = data.reset return True # Break out of chunk processing early else: @@ -534,6 +537,8 @@ class Tokenizer(object): if chunk == '"' and not data.ignore_quote: data.context ^= data.CX_NEED_QUOTE data.literal = False + data.quoted = True + self._push(self._context) data.reset = self._head elif chunk.isspace(): data.padding_buffer.append(chunk) @@ -545,7 +550,7 @@ class Tokenizer(object): data.context |= data.CX_NEED_SPACE data.literal = True else: - data.quote_buffer.append(chunk) + self._parse_tag_chunk(chunk) elif chunk.isspace(): self._push_tag_buffer(data) data.padding_buffer.append(chunk) @@ -572,14 +577,15 @@ class Tokenizer(object): buf = data.padding_buffer while len(buf) < 3: buf.append("") + if data.quoted: + data.quoted = False + self._write_first(tokens.TagAttrQuote()) + self._write_all(self._pop()) self._write_first(tokens.TagAttrStart( pad_after_eq=buf.pop(), pad_before_eq=buf.pop(), pad_first=buf.pop())) - if data.quote_buffer: - self._write(tokens.TagAttrQuote()) - self._write_text("".join(data.quote_buffer)) self._write_all(self._pop()) - data.padding_buffer, data.quote_buffer = [], [] + data.padding_buffer = [] data.ignore_quote = False def _handle_tag_open_close(self): From 9693b6d5e61571dfd1e0ea3a65fb95a46dcad1c7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 2 Jul 2013 00:48:20 -0400 Subject: [PATCH 34/77] Replace data.literal and data.quoted with a data.CX_QUOTED context --- mwparserfromhell/parser/tokenizer.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 29c2772..129c19a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ 
-42,16 +42,15 @@ class _TagOpenData(object): CX_ATTR_READY = 1 << 1 CX_ATTR_NAME = 1 << 2 CX_ATTR_VALUE = 1 << 3 - CX_NEED_SPACE = 1 << 4 - CX_NEED_EQUALS = 1 << 5 - CX_NEED_QUOTE = 1 << 6 + CX_QUOTED = 1 << 4 + CX_NEED_SPACE = 1 << 5 + CX_NEED_EQUALS = 1 << 6 + CX_NEED_QUOTE = 1 << 7 CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE def __init__(self): self.context = self.CX_NAME - self.literal = True self.padding_buffer = [] - self.quoted = False self.reset = 0 self.ignore_quote = False @@ -448,17 +447,18 @@ class Tokenizer(object): self._write(tokens.TagOpenOpen(showtag=True)) while True: this, next = self._read(), self._read(1) + can_exit = not data.context & data.CX_QUOTED or data.context & data.CX_NEED_SPACE if this not in self.MARKERS: for chunk in self.tag_splitter.split(this): if self._handle_tag_chunk(data, chunk): continue elif this is self.END: if self._context & contexts.TAG_ATTR: - if data.quoted: + if data.context & data.CX_QUOTED: self._pop() self._pop() self._fail_route() - elif this == ">" and data.literal: + elif this == ">" and can_exit: if data.context & data.CX_ATTR: self._push_tag_buffer(data) padding = data.padding_buffer[0] if data.padding_buffer else "" @@ -466,7 +466,7 @@ class Tokenizer(object): self._context = contexts.TAG_BODY self._head += 1 return self._parse(push=False) - elif this == "/" and next == ">" and data.literal: + elif this == "/" and next == ">" and can_exit: if data.context & data.CX_ATTR: self._push_tag_buffer(data) padding = data.padding_buffer[0] if data.padding_buffer else "" @@ -499,9 +499,8 @@ class Tokenizer(object): data.padding_buffer.append(chunk) data.context = data.CX_ATTR_READY else: - if data.context & data.CX_ATTR_VALUE: - data.context ^= data.CX_NEED_SPACE - data.quoted = False + if data.context & data.CX_QUOTED: + data.context ^= data.CX_NEED_SPACE | data.CX_QUOTED data.ignore_quote = True self._pop() self._head = data.reset @@ -536,8 +535,7 @@ class Tokenizer(object): if data.context & data.CX_NEED_QUOTE: if chunk == '"' and not data.ignore_quote: data.context ^= data.CX_NEED_QUOTE - data.literal = False - data.quoted = True + data.context |= data.CX_QUOTED self._push(self._context) data.reset = self._head elif chunk.isspace(): @@ -545,10 +543,9 @@ class Tokenizer(object): else: data.context ^= data.CX_NEED_QUOTE self._parse_tag_chunk(chunk) - elif not data.literal: + elif data.context & data.CX_QUOTED: if chunk == '"': data.context |= data.CX_NEED_SPACE - data.literal = True else: self._parse_tag_chunk(chunk) elif chunk.isspace(): @@ -574,13 +571,12 @@ class Tokenizer(object): *data* is a :py:class:`_TagOpenData` object. """ + if data.context & data.CX_QUOTED: + self._write_first(tokens.TagAttrQuote()) + self._write_all(self._pop()) buf = data.padding_buffer while len(buf) < 3: buf.append("") - if data.quoted: - data.quoted = False - self._write_first(tokens.TagAttrQuote()) - self._write_all(self._pop()) self._write_first(tokens.TagAttrStart( pad_after_eq=buf.pop(), pad_before_eq=buf.pop(), pad_first=buf.pop())) From dd6bb1637d26fb26085143dd6c13be310d1b04bc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 2 Jul 2013 01:31:28 -0400 Subject: [PATCH 35/77] Support tag nesting properly; unit tests; recursion checks for tags. 
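The recursion checks mentioned in the subject reuse the tokenizer's existing _can_recurse() gate, the same one that already guards templates and wikilinks, so a pathological page cannot nest tags without bound. The method's body does not appear in these hunks; judging from the MAX_DEPTH and MAX_CYCLES class attributes shown earlier in the series, a guard of roughly this shape is assumed (an illustration, not the library's verbatim code):

    MAX_DEPTH = 40       # deepest allowed stack of nested parses
    MAX_CYCLES = 100000  # cap on total parser work (assumed meaning)

    def can_recurse(depth, cycles):
        """Return whether opening another nested parse is still safe."""
        return depth < MAX_DEPTH and cycles < MAX_CYCLES

    assert can_recurse(0, 0)
    assert not can_recurse(40, 0)      # too deeply nested
    assert not can_recurse(5, 100000)  # too much total work

When the guard says no, the "<" is written out as plain text instead, exactly as the new else branch in _parse() does below.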
--- mwparserfromhell/parser/tokenizer.py | 16 +++++++++++----- tests/tokenizer/tags.mwtest | 28 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 129c19a..2d1245f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -447,7 +447,8 @@ class Tokenizer(object): self._write(tokens.TagOpenOpen(showtag=True)) while True: this, next = self._read(), self._read(1) - can_exit = not data.context & data.CX_QUOTED or data.context & data.CX_NEED_SPACE + can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or + data.context & data.CX_NEED_SPACE) if this not in self.MARKERS: for chunk in self.tag_splitter.split(this): if self._handle_tag_chunk(data, chunk): @@ -488,8 +489,8 @@ class Tokenizer(object): if not chunk: return if data.context & data.CX_NAME: - if chunk != chunk.lstrip(): # Tags cannot start with whitespace - self._fail_route() + if chunk in self.MARKERS or chunk.isspace(): + self._fail_route() # Tags must start with text (not a space) self._write_text(chunk) data.context = data.CX_NEED_SPACE elif data.context & data.CX_NEED_SPACE: @@ -563,6 +564,8 @@ class Tokenizer(object): self._parse_template_or_argument() elif chunk == next == "[": self._parse_wikilink() + elif chunk == "<": + self._parse_tag() else: self._write_text(chunk) @@ -735,10 +738,13 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) - elif this == "<" and next != "/" and not self._context & contexts.TAG_CLOSE: - self._parse_tag() elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() + elif this == "<": + if not self._context & contexts.TAG_CLOSE and self._can_recurse(): + self._parse_tag() + else: + self._write_text("<") elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() else: diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 7d5f338..17010e9 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -96,6 +96,34 @@ output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" " --- +name: nested_tag +label: a tag nested within the attributes of another +input: "foo>citation" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: nested_tag_quoted +label: a tag nested within the attributes of another, quoted +input: "foo">citation" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + 
+name: nested_troll_tag +label: a bogus tag that appears to be nested within the attributes of another +input: ">citation" +output: [Text(text=">citation")] + +--- + +name: nested_troll_tag_quoted +label: a bogus tag that appears to be nested within the attributes of another, quoted +input: "citation" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text=""), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + name: invalid_space_begin_open label: invalid tag: a space at the beginning of the open tag input: "< ref>test" From 5e8794da5eff96fc649956283e5e115582ade86d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 2 Jul 2013 20:04:28 -0400 Subject: [PATCH 36/77] Refactor more of the tag tokenization process. --- mwparserfromhell/parser/tokenizer.py | 39 +++++++++++++++++------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2d1245f..084d94b 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -449,30 +449,18 @@ class Tokenizer(object): this, next = self._read(), self._read(1) can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or data.context & data.CX_NEED_SPACE) - if this not in self.MARKERS: - for chunk in self.tag_splitter.split(this): - if self._handle_tag_chunk(data, chunk): - continue - elif this is self.END: + if this is self.END: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: self._pop() self._pop() self._fail_route() elif this == ">" and can_exit: - if data.context & data.CX_ATTR: - self._push_tag_buffer(data) - padding = data.padding_buffer[0] if data.padding_buffer else "" - self._write(tokens.TagCloseOpen(padding=padding)) + self._handle_tag_close_open(data, tokens.TagCloseOpen) self._context = contexts.TAG_BODY - self._head += 1 return self._parse(push=False) elif this == "/" and next == ">" and can_exit: - if data.context & data.CX_ATTR: - self._push_tag_buffer(data) - padding = data.padding_buffer[0] if data.padding_buffer else "" - self._write(tokens.TagCloseSelfclose(padding=padding)) - self._head += 1 + self._handle_tag_close_open(data, tokens.TagCloseSelfclose) return self._pop() else: for chunk in self.tag_splitter.split(this): @@ -514,7 +502,7 @@ class Tokenizer(object): else: data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) elif data.context & data.CX_ATTR_NAME: if chunk.isspace(): data.padding_buffer.append(chunk) @@ -530,7 +518,7 @@ class Tokenizer(object): data.padding_buffer.append("") # No padding before tag data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) elif data.context & data.CX_ATTR_VALUE: ### handle backslashes here if data.context & data.CX_NEED_QUOTE: @@ -543,20 +531,21 @@ class Tokenizer(object): data.padding_buffer.append(chunk) else: data.context ^= data.CX_NEED_QUOTE - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) elif data.context & data.CX_QUOTED: if chunk == '"': data.context |= data.CX_NEED_SPACE else: - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) elif chunk.isspace(): self._push_tag_buffer(data) data.padding_buffer.append(chunk) data.context = data.CX_ATTR_READY else: - 
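One piece of the refactor below is worth calling out: the ">" and "/>" endings previously had separate handlers, and patch 36 merges them into a single _handle_tag_close_open(data, token) that receives the token class to emit. Python classes are first-class objects, so the shared handler simply instantiates whichever one it was handed. The pattern in miniature, with stand-in classes rather than the real mwparserfromhell tokens:

    class TagCloseOpen(object):
        def __init__(self, padding):
            self.padding = padding

    class TagCloseSelfclose(TagCloseOpen):
        pass

    def handle_close(token_class, padding):
        # Emit whichever close token the caller selected.
        return token_class(padding=padding)

    assert type(handle_close(TagCloseOpen, "")) is TagCloseOpen
    assert type(handle_close(TagCloseSelfclose, " ")) is TagCloseSelfclose

This keeps the padding bookkeeping in one place while each call site stays a single line.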
self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) - def _parse_tag_chunk(self, chunk): + def _parse_text_in_tag(self, chunk): + """Parse a chunk of text in a tag that has no special significance.""" next = self._read(1) if not self._can_recurse() or chunk not in self.MARKERS: self._write_text(chunk) @@ -587,6 +576,14 @@ class Tokenizer(object): data.padding_buffer = [] data.ignore_quote = False + def _handle_tag_close_open(self, data, token): + """Handle the closing of a open tag (````).""" + if data.context & data.CX_ATTR: + self._push_tag_buffer(data) + padding = data.padding_buffer[0] if data.padding_buffer else "" + self._write(token(padding=padding)) + self._head += 1 + def _handle_tag_open_close(self): """Handle the opening of a closing tag (````).""" self._write(tokens.TagOpenClose()) From e99c9d3038a64c71981fcd9783e2ab3a21f846c6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 3 Jul 2013 18:29:07 -0400 Subject: [PATCH 37/77] More tag refactoring; fix some bugs. --- mwparserfromhell/parser/tokenizer.py | 176 ++++++++++++++++------------------- 1 file changed, 80 insertions(+), 96 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 084d94b..5bb7059 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -46,13 +46,11 @@ class _TagOpenData(object): CX_NEED_SPACE = 1 << 5 CX_NEED_EQUALS = 1 << 6 CX_NEED_QUOTE = 1 << 7 - CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE def __init__(self): self.context = self.CX_NAME self.padding_buffer = [] self.reset = 0 - self.ignore_quote = False class Tokenizer(object): @@ -452,7 +450,11 @@ class Tokenizer(object): if this is self.END: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: + # Unclosed attribute quote: reset, don't die + data.context = data.CX_ATTR_VALUE self._pop() + self._head = data.reset + continue self._pop() self._fail_route() elif this == ">" and can_exit: @@ -463,122 +465,104 @@ class Tokenizer(object): self._handle_tag_close_open(data, tokens.TagCloseSelfclose) return self._pop() else: - for chunk in self.tag_splitter.split(this): - if self._handle_tag_chunk(data, chunk): - continue + self._handle_tag_data(data, this) self._head += 1 - def _handle_tag_chunk(self, data, chunk): - """Handle a *chunk* of text inside a HTML open tag. + def _push_tag_buffer(self, data): + """Write a pending tag attribute from *data* to the stack.""" + if data.context & data.CX_QUOTED: + self._write_first(tokens.TagAttrQuote()) + self._write_all(self._pop()) + buf = data.padding_buffer + while len(buf) < 3: + buf.append("") + self._write_first(tokens.TagAttrStart(pad_after_eq=buf.pop(), + pad_before_eq=buf.pop(), pad_first=buf.pop())) + self._write_all(self._pop()) + data.padding_buffer = [] - A "chunk" is either a marker, whitespace, or text containing no markers - or whitespace. *data* is a :py:class:`_TagOpenData` object. 
- """ - if not chunk: - return - if data.context & data.CX_NAME: - if chunk in self.MARKERS or chunk.isspace(): - self._fail_route() # Tags must start with text (not a space) - self._write_text(chunk) - data.context = data.CX_NEED_SPACE - elif data.context & data.CX_NEED_SPACE: - if chunk.isspace(): - if data.context & data.CX_ATTR_VALUE: - self._push_tag_buffer(data) - data.padding_buffer.append(chunk) - data.context = data.CX_ATTR_READY - else: + def _handle_tag_data(self, data, text): + """Handle all sorts of *text* data inside of an HTML open tag.""" + for chunk in self.tag_splitter.split(text): + if not chunk: + continue + if data.context & data.CX_NAME: + if chunk in self.MARKERS or chunk.isspace(): + self._fail_route() # Tags must start with text, not spaces + data.context = data.CX_NEED_SPACE + elif chunk.isspace(): + self._handle_tag_space(data, chunk) + continue + elif data.context & data.CX_NEED_SPACE: if data.context & data.CX_QUOTED: - data.context ^= data.CX_NEED_SPACE | data.CX_QUOTED - data.ignore_quote = True + data.context = data.CX_ATTR_VALUE self._pop() - self._head = data.reset - return True # Break out of chunk processing early - else: - self._fail_route() - elif data.context & data.CX_ATTR_READY: - if chunk.isspace(): - data.padding_buffer.append(chunk) - else: + self._head = data.reset - 1 # Will be auto-incremented + return # Break early + self._fail_route() + elif data.context & data.CX_ATTR_READY: data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_text_in_tag(chunk) - elif data.context & data.CX_ATTR_NAME: - if chunk.isspace(): - data.padding_buffer.append(chunk) - data.context |= data.CX_NEED_EQUALS - elif chunk == "=": - if not data.context & data.CX_NEED_EQUALS: - data.padding_buffer.append("") # No padding before equals - data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE - self._write(tokens.TagAttrEquals()) - else: + elif data.context & data.CX_ATTR_NAME: + if chunk == "=": + if not data.context & data.CX_NEED_EQUALS: + data.padding_buffer.append("") # No padding before '=' + data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE + self._write(tokens.TagAttrEquals()) + continue if data.context & data.CX_NEED_EQUALS: self._push_tag_buffer(data) data.padding_buffer.append("") # No padding before tag data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_text_in_tag(chunk) - elif data.context & data.CX_ATTR_VALUE: - ### handle backslashes here - if data.context & data.CX_NEED_QUOTE: - if chunk == '"' and not data.ignore_quote: + elif data.context & data.CX_ATTR_VALUE: + ### handle backslashes here + if data.context & data.CX_NEED_QUOTE: data.context ^= data.CX_NEED_QUOTE - data.context |= data.CX_QUOTED - self._push(self._context) - data.reset = self._head - elif chunk.isspace(): - data.padding_buffer.append(chunk) - else: - data.context ^= data.CX_NEED_QUOTE - self._parse_text_in_tag(chunk) - elif data.context & data.CX_QUOTED: - if chunk == '"': - data.context |= data.CX_NEED_SPACE - else: - self._parse_text_in_tag(chunk) - elif chunk.isspace(): - self._push_tag_buffer(data) - data.padding_buffer.append(chunk) - data.context = data.CX_ATTR_READY - else: - self._parse_text_in_tag(chunk) + if chunk == '"': + data.context |= data.CX_QUOTED + self._push(self._context) + data.reset = self._head + continue + elif data.context & data.CX_QUOTED: + if chunk == '"': + data.context |= data.CX_NEED_SPACE + continue + self._handle_tag_text(chunk) - def _parse_text_in_tag(self, chunk): - """Parse a chunk of text in a tag 
that has no special significance."""
+    def _handle_tag_space(self, data, text):
+        """Handle whitespace (*text*) inside of an HTML open tag."""
+        ctx = data.context
+        end_of_value = ctx & data.CX_ATTR_VALUE and not ctx & (data.CX_QUOTED | data.CX_NEED_QUOTE)
+        if end_of_value or (ctx & data.CX_QUOTED and ctx & data.CX_NEED_SPACE):
+            self._push_tag_buffer(data)
+            data.context = data.CX_ATTR_READY
+        elif ctx & data.CX_NEED_SPACE:
+            data.context = data.CX_ATTR_READY
+        elif ctx & data.CX_ATTR_NAME:
+            data.context |= data.CX_NEED_EQUALS
+        if ctx & data.CX_QUOTED and not ctx & data.CX_NEED_SPACE:
+            self._write_text(text)
+        else:
+            data.padding_buffer.append(text)
+
+    def _handle_tag_text(self, text):
+        """Handle regular *text* inside of an HTML open tag."""
         next = self._read(1)
-        if not self._can_recurse() or chunk not in self.MARKERS:
-            self._write_text(chunk)
-        elif chunk == next == "{":
+        if not self._can_recurse() or text not in self.MARKERS:
+            self._write_text(text)
+        elif text == next == "{":
             self._parse_template_or_argument()
-        elif chunk == next == "[":
+        elif text == next == "[":
             self._parse_wikilink()
-        elif chunk == "<":
+        elif text == "<":
             self._parse_tag()
         else:
-            self._write_text(chunk)
-
-    def _push_tag_buffer(self, data):
-        """Write a pending tag attribute from *data* to the stack.
-
-        *data* is a :py:class:`_TagOpenData` object.
-        """
-        if data.context & data.CX_QUOTED:
-            self._write_first(tokens.TagAttrQuote())
-            self._write_all(self._pop())
-        buf = data.padding_buffer
-        while len(buf) < 3:
-            buf.append("")
-        self._write_first(tokens.TagAttrStart(
-            pad_after_eq=buf.pop(), pad_before_eq=buf.pop(),
-            pad_first=buf.pop()))
-        self._write_all(self._pop())
-        data.padding_buffer = []
-        data.ignore_quote = False
+            self._write_text(text)
 
     def _handle_tag_close_open(self, data, token):
         """Handle the closing of an open tag (``>``)."""
-        if data.context & data.CX_ATTR:
+        if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
             self._push_tag_buffer(data)
         padding = data.padding_buffer[0] if data.padding_buffer else ""
         self._write(token(padding=padding))

From 17c71e335f35b3c10e572daeaf2cb2c6707ea000 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 3 Jul 2013 18:30:02 -0400
Subject: [PATCH 38/77] Add three tests for invalid attribute quote usage.
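
The three cases cover quote characters in attribute values that should
not be treated as quoting at all: a quote that is never closed
(unclosed_quote), a closing quote followed by extra text (fake_quote),
and the same situation with templates, wikilinks, and whitespace mixed
in (fake_quote_complex). In each case the tag must still parse, with
the quote characters surviving as literal text in the attribute value
instead of producing a TagAttrQuote token.

As a rough illustration of the intended behavior (a sketch, not part
of the test suite; assumes the high-level parse API, while the mwtest
cases in the diff below are what the suite actually runs):

    import mwparserfromhell

    # The quote after style= is never closed, so it should be demoted
    # to literal text and the tag should still parse as a Tag node.
    code = mwparserfromhell.parse('<span style="foo>stuff</span>')
    assert code.filter_tags()  # one <span> tag, quote kept literally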
---
 tests/tokenizer/tags.mwtest | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest
index 17010e9..89b2b2e 100644
--- a/tests/tokenizer/tags.mwtest
+++ b/tests/tokenizer/tags.mwtest
@@ -194,6 +194,27 @@ output: [Text(text="<"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text
 
 ---
 
+name: unclosed_quote
+label: a quoted attribute that is never closed
+input: "<span style=\"foo>stuff</span>"
+output: [TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]
+
+---
+
+name: fake_quote
+label: a fake quoted attribute
+input: "<span style=\"foo\"bar>stuff</span>"
+output: [TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo\"bar"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]
+
+---
+
+name: fake_quote_complex
+label: a fake quoted attribute, with spaces and templates and links
+input: "<span style=\"foo {{bar}}\n[[baz]]\"buzz>stuff</span>"
+output: [TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo"), TagAttrStart(pad_first=" ", pad_before_eq="\n", pad_after_eq=""), TemplateOpen(), Text(text="bar"), TemplateClose(), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), WikilinkOpen(), Text(text="baz"), WikilinkClose(), Text(text="\"buzz"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]
+
+---
+
 name: incomplete_lbracket
 label: incomplete tags: just a left bracket
 input: "<"

From 591a0f5ed57f3ccad221a2870749031064003c5c Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 3 Jul 2013 18:46:41 -0400
Subject: [PATCH 39/77] Change 'write' to 'emit'; adjust some other names for PEP8.

---
 mwparserfromhell/parser/tokenizer.py | 149 +++++++++++++++++------------------
 1 file changed, 74 insertions(+), 75 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 5bb7059..515a7a2 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -24,8 +24,7 @@ from __future__ import unicode_literals
 from math import log
 import re
 
-from . import contexts
-from . import tokens
+from . 
import contexts, tokens from ..compat import htmlentities from ..tag_defs import is_parsable @@ -136,33 +135,33 @@ class Tokenizer(object): self._pop() raise BadRoute() - def _write(self, token): + def _emit(self, token): """Write a token to the end of the current token stack.""" self._push_textbuffer() self._stack.append(token) - def _write_first(self, token): + def _emit_first(self, token): """Write a token to the beginning of the current token stack.""" self._push_textbuffer() self._stack.insert(0, token) - def _write_text(self, text): + def _emit_text(self, text): """Write text to the current textbuffer.""" self._textbuffer.append(text) - def _write_all(self, tokenlist): + def _emit_all(self, tokenlist): """Write a series of tokens to the current stack at once.""" if tokenlist and isinstance(tokenlist[0], tokens.Text): - self._write_text(tokenlist.pop(0).text) + self._emit_text(tokenlist.pop(0).text) self._push_textbuffer() self._stack.extend(tokenlist) - def _write_text_then_stack(self, text): + def _emit_text_then_stack(self, text): """Pop the current stack, write *text*, and then write the stack.""" stack = self._pop() - self._write_text(text) + self._emit_text(text) if stack: - self._write_all(stack) + self._emit_all(stack) self._head -= 1 def _read(self, delta=0, wrap=False, strict=False): @@ -198,12 +197,12 @@ class Tokenizer(object): while braces: if braces == 1: - return self._write_text_then_stack("{") + return self._emit_text_then_stack("{") if braces == 2: try: self._parse_template() except BadRoute: - return self._write_text_then_stack("{{") + return self._emit_text_then_stack("{{") break try: self._parse_argument() @@ -213,11 +212,11 @@ class Tokenizer(object): self._parse_template() braces -= 2 except BadRoute: - return self._write_text_then_stack("{" * braces) + return self._emit_text_then_stack("{" * braces) if braces: self._head += 1 - self._write_all(self._pop()) + self._emit_all(self._pop()) if self._context & contexts.FAIL_NEXT: self._context ^= contexts.FAIL_NEXT @@ -229,9 +228,9 @@ class Tokenizer(object): except BadRoute: self._head = reset raise - self._write_first(tokens.TemplateOpen()) - self._write_all(template) - self._write(tokens.TemplateClose()) + self._emit_first(tokens.TemplateOpen()) + self._emit_all(template) + self._emit(tokens.TemplateClose()) def _parse_argument(self): """Parse an argument at the head of the wikicode string.""" @@ -241,9 +240,9 @@ class Tokenizer(object): except BadRoute: self._head = reset raise - self._write_first(tokens.ArgumentOpen()) - self._write_all(argument) - self._write(tokens.ArgumentClose()) + self._emit_first(tokens.ArgumentOpen()) + self._emit_all(argument) + self._emit(tokens.ArgumentClose()) def _handle_template_param(self): """Handle a template parameter at the head of the string.""" @@ -252,22 +251,22 @@ class Tokenizer(object): elif self._context & contexts.TEMPLATE_PARAM_VALUE: self._context ^= contexts.TEMPLATE_PARAM_VALUE elif self._context & contexts.TEMPLATE_PARAM_KEY: - self._write_all(self._pop(keep_context=True)) + self._emit_all(self._pop(keep_context=True)) self._context |= contexts.TEMPLATE_PARAM_KEY - self._write(tokens.TemplateParamSeparator()) + self._emit(tokens.TemplateParamSeparator()) self._push(self._context) def _handle_template_param_value(self): """Handle a template parameter's value at the head of the string.""" - self._write_all(self._pop(keep_context=True)) + self._emit_all(self._pop(keep_context=True)) self._context ^= contexts.TEMPLATE_PARAM_KEY self._context |= 
contexts.TEMPLATE_PARAM_VALUE - self._write(tokens.TemplateParamEquals()) + self._emit(tokens.TemplateParamEquals()) def _handle_template_end(self): """Handle the end of a template at the head of the string.""" if self._context & contexts.TEMPLATE_PARAM_KEY: - self._write_all(self._pop(keep_context=True)) + self._emit_all(self._pop(keep_context=True)) self._head += 1 return self._pop() @@ -275,7 +274,7 @@ class Tokenizer(object): """Handle the separator between an argument's name and default.""" self._context ^= contexts.ARGUMENT_NAME self._context |= contexts.ARGUMENT_DEFAULT - self._write(tokens.ArgumentSeparator()) + self._emit(tokens.ArgumentSeparator()) def _handle_argument_end(self): """Handle the end of an argument at the head of the string.""" @@ -290,19 +289,19 @@ class Tokenizer(object): wikilink = self._parse(contexts.WIKILINK_TITLE) except BadRoute: self._head = reset - self._write_text("[[") + self._emit_text("[[") else: if self._context & contexts.FAIL_NEXT: self._context ^= contexts.FAIL_NEXT - self._write(tokens.WikilinkOpen()) - self._write_all(wikilink) - self._write(tokens.WikilinkClose()) + self._emit(tokens.WikilinkOpen()) + self._emit_all(wikilink) + self._emit(tokens.WikilinkClose()) def _handle_wikilink_separator(self): """Handle the separator between a wikilink's title and its text.""" self._context ^= contexts.WIKILINK_TITLE self._context |= contexts.WIKILINK_TEXT - self._write(tokens.WikilinkSeparator()) + self._emit(tokens.WikilinkSeparator()) def _handle_wikilink_end(self): """Handle the end of a wikilink at the head of the string.""" @@ -324,13 +323,13 @@ class Tokenizer(object): title, level = self._parse(context) except BadRoute: self._head = reset + best - 1 - self._write_text("=" * best) + self._emit_text("=" * best) else: - self._write(tokens.HeadingStart(level=level)) + self._emit(tokens.HeadingStart(level=level)) if level < best: - self._write_text("=" * (best - level)) - self._write_all(title) - self._write(tokens.HeadingEnd()) + self._emit_text("=" * (best - level)) + self._emit_all(title) + self._emit(tokens.HeadingEnd()) finally: self._global ^= contexts.GL_HEADING @@ -349,28 +348,28 @@ class Tokenizer(object): after, after_level = self._parse(self._context) except BadRoute: if level < best: - self._write_text("=" * (best - level)) + self._emit_text("=" * (best - level)) self._head = reset + best - 1 return self._pop(), level else: # Found another closure - self._write_text("=" * best) - self._write_all(after) + self._emit_text("=" * best) + self._emit_all(after) return self._pop(), after_level def _really_parse_entity(self): """Actually parse an HTML entity and ensure that it is valid.""" - self._write(tokens.HTMLEntityStart()) + self._emit(tokens.HTMLEntityStart()) self._head += 1 this = self._read(strict=True) if this == "#": numeric = True - self._write(tokens.HTMLEntityNumeric()) + self._emit(tokens.HTMLEntityNumeric()) self._head += 1 this = self._read(strict=True) if this[0].lower() == "x": hexadecimal = True - self._write(tokens.HTMLEntityHex(char=this[0])) + self._emit(tokens.HTMLEntityHex(char=this[0])) this = this[1:] if not this: self._fail_route() @@ -396,8 +395,8 @@ class Tokenizer(object): if this not in htmlentities.entitydefs: self._fail_route() - self._write(tokens.Text(text=this)) - self._write(tokens.HTMLEntityEnd()) + self._emit(tokens.Text(text=this)) + self._emit(tokens.HTMLEntityEnd()) def _parse_entity(self): """Parse an HTML entity at the head of the wikicode string.""" @@ -407,9 +406,9 @@ class Tokenizer(object): 
self._really_parse_entity() except BadRoute: self._head = reset - self._write_text(self._read()) + self._emit_text(self._read()) else: - self._write_all(self._pop()) + self._emit_all(self._pop()) def _parse_comment(self): """Parse an HTML comment at the head of the wikicode string.""" @@ -419,11 +418,11 @@ class Tokenizer(object): comment = self._parse(contexts.COMMENT) except BadRoute: self._head = reset - self._write_text("