From 9b98907751c28c48e0a2ff97583c26f371948128 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Tue, 13 Aug 2013 20:55:35 -0400
Subject: [PATCH] Add C hooks and prototypes for wiki-markup tags.

---
 mwparserfromhell/parser/tokenizer.c | 227 ++++++++++++++++++++++++++++++++++--
 mwparserfromhell/parser/tokenizer.h |  96 ++++++++-------
 2 files changed, 273 insertions(+), 50 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index bae5ec2..be996ad 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -325,9 +325,10 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
 */
 static void* Tokenizer_fail_route(Tokenizer* self)
 {
+    int context = self->topstack->context;
     PyObject* stack = Tokenizer_pop(self);
     Py_XDECREF(stack);
-    FAIL_ROUTE();
+    FAIL_ROUTE(context);
     return NULL;
 }
 
@@ -1776,7 +1777,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
                 return -1;
             }
             if (!IS_SINGLE_ONLY(name))
-                FAIL_ROUTE();
+                FAIL_ROUTE(0);
             break;
         }
         Textbuffer_write(&buf, this);
@@ -1823,12 +1824,201 @@ static int Tokenizer_parse_tag(Tokenizer* self)
 }
 
 /*
+    Write the body of a tag and the tokens that should surround it.
+*/
+static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks,
+                                    PyObject* body)
+{
+    // self._emit(tokens.TagOpenOpen(wiki_markup=markup))
+    // self._emit_text(tag)
+    // self._emit(tokens.TagCloseOpen())
+    // self._emit_all(body)
+    // self._emit(tokens.TagOpenClose())
+    // self._emit_text(tag)
+    // self._emit(tokens.TagCloseClose())
+}
+
+/*
+    Parse wiki-style italics.
+*/
+static int Tokenizer_parse_italics(Tokenizer* self)
+{
+    // reset = self._head
+    // try:
+    //     stack = self._parse(contexts.STYLE_ITALICS)
+    // except BadRoute as route:
+    //     self._head = reset
+    //     if route.context & contexts.STYLE_PASS_AGAIN:
+    //         stack = self._parse(route.context | contexts.STYLE_SECOND_PASS)
+    //     else:
+    //         return self._emit_text("''")
+    // self._emit_style_tag("i", "''", stack)
+}
+
+/*
+    Parse wiki-style bold.
+*/
+static int Tokenizer_parse_bold(Tokenizer* self)
+{
+    // reset = self._head
+    // try:
+    //     stack = self._parse(contexts.STYLE_BOLD)
+    // except BadRoute:
+    //     self._head = reset
+    //     if self._context & contexts.STYLE_SECOND_PASS:
+    //         self._emit_text("'")
+    //         return True ## we can return 1 for this and -1 for errors (switch case)
+    //     elif self._context & contexts.STYLE_ITALICS:
+    //         self._context |= contexts.STYLE_PASS_AGAIN
+    //         self._emit_text("'''")
+    //     else:
+    //         self._emit_text("'")
+    //         self._parse_italics()
+    // else:
+    //     self._emit_style_tag("b", "'''", stack)
+}
+
+/*
+    Parse wiki-style italics and bold together (i.e., five ticks).
+*/
+static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
+{
+    // reset = self._head
+    // try:
+    //     stack = self._parse(contexts.STYLE_BOLD)
+    // except BadRoute:
+    //     self._head = reset
+    //     try:
+    //         stack = self._parse(contexts.STYLE_ITALICS)
+    //     except BadRoute:
+    //         self._head = reset
+    //         self._emit_text("'''''")
+    //     else:
+    //         reset = self._head
+    //         try:
+    //             stack2 = self._parse(contexts.STYLE_BOLD)
+    //         except BadRoute:
+    //             self._head = reset
+    //             self._emit_text("'''")
+    //             self._emit_style_tag("i", "''", stack)
+    //         else:
+    //             self._push()
+    //             self._emit_style_tag("i", "''", stack)
+    //             self._emit_all(stack2)
+    //             self._emit_style_tag("b", "'''", self._pop())
+    // else:
+    //     reset = self._head
+    //     try:
+    //         stack2 = self._parse(contexts.STYLE_ITALICS)
+    //     except BadRoute:
+    //         self._head = reset
+    //         self._emit_text("''")
+    //         self._emit_style_tag("b", "'''", stack)
+    //     else:
+    //         self._push()
+    //         self._emit_style_tag("b", "'''", stack)
+    //         self._emit_all(stack2)
+    //         self._emit_style_tag("i", "''", self._pop())
+}
+
+/*
+    Parse wiki-style formatting (''/''' for italics/bold).
+*/
+static PyObject* Tokenizer_parse_style(Tokenizer* self)
+{
+    // self._head += 2
+    // ticks = 2
+    // while self._read() == "'":
+    //     self._head += 1
+    //     ticks += 1
+    // italics = self._context & contexts.STYLE_ITALICS
+    // bold = self._context & contexts.STYLE_BOLD
+    // if ticks > 5:
+    //     self._emit_text("'" * (ticks - 5))
+    //     ticks = 5
+    // elif ticks == 4:
+    //     self._emit_text("'")
+    //     ticks = 3
+    // if (italics and ticks in (2, 5)) or (bold and ticks in (3, 5)):
+    //     if ticks == 5:
+    //         self._head -= 3 if italics else 2
+    //     return self._pop()
+    // elif not self._can_recurse():
+    //     if ticks == 3:
+    //         if self._context & contexts.STYLE_SECOND_PASS:
+    //             self._emit_text("'")
+    //             return self._pop()
+    //         self._context |= contexts.STYLE_PASS_AGAIN
+    //     self._emit_text("'" * ticks)
+    // elif ticks == 2:
+    //     self._parse_italics()
+    // elif ticks == 3:
+    //     if self._parse_bold():
+    //         return self._pop()
+    // elif ticks == 5:
+    //     self._parse_italics_and_bold()
+    // self._head -= 1
+    // ## we can return Py_None for non-error empty returns
+}
+
+/*
+    Handle a list marker at the head (#, *, ;, :).
+*/
+static int Tokenizer_handle_list_marker(Tokenizer* self)
+{
+    // markup = self._read()
+    // if markup == ";":
+    //     self._context |= contexts.DL_TERM
+    // self._emit(tokens.TagOpenOpen(wiki_markup=markup))
+    // self._emit_text(get_html_tag(markup))
+    // self._emit(tokens.TagCloseSelfclose())
+}
+
+/*
+    Handle a wiki-style list (#, *, ;, :).
+*/
+static int Tokenizer_handle_list(Tokenizer* self)
+{
+    // self._handle_list_marker()
+    // while self._read(1) in ("#", "*", ";", ":"):
+    //     self._head += 1
+    //     self._handle_list_marker()
+}
+
+/*
+    Handle a wiki-style horizontal rule (----) in the string.
+*/
+static int Tokenizer_handle_hr(Tokenizer* self)
+{
+    // length = 4
+    // self._head += 3
+    // while self._read(1) == "-":
+    //     length += 1
+    //     self._head += 1
+    // self._emit(tokens.TagOpenOpen(wiki_markup="-" * length))
+    // self._emit_text("hr")
+    // self._emit(tokens.TagCloseSelfclose())
+}
+
+/*
+    Handle the term in a description list ('foo' in ';foo:bar').
+*/
+static int Tokenizer_handle_dl_term(Tokenizer* self)
+{
+    // self._context ^= contexts.DL_TERM
+    // if self._read() == ":":
+    //     self._handle_list_marker()
+    // else:
+    //     self._emit_text("\n")
+}
+
+/*
     Handle the end of the stream of wikitext.
 */
 static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
 {
     static int fail_contexts = (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK |
-                                LC_HEADING | LC_COMMENT | LC_TAG);
+                                LC_HEADING | LC_COMMENT | LC_TAG | LC_STYLE);
     static int double_fail = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
     PyObject *token, *text, *trash;
     int single;
@@ -1943,7 +2133,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
     static int double_unsafe = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
     int this_context, is_marker, i;
     Py_UNICODE this, next, next_next, last;
-    PyObject* trash;
+    PyObject* temp;
 
     if (push) {
         if (Tokenizer_push(self, context))
@@ -1955,8 +2145,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         if (this_context & unsafe_contexts) {
             if (Tokenizer_verify_safe(self, this_context, this) < 0) {
                 if (this_context & double_unsafe) {
-                    trash = Tokenizer_pop(self);
-                    Py_XDECREF(trash);
+                    temp = Tokenizer_pop(self);
+                    Py_XDECREF(temp);
                 }
                 return Tokenizer_fail_route(self);
             }
@@ -1977,6 +2167,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         if (this == *"")
             return Tokenizer_handle_end(self, this_context);
         next = Tokenizer_READ(self, 1);
+        last = Tokenizer_READ_BACKWARDS(self, 1);
         if (this_context & LC_COMMENT) {
             if (this == next && next == *"-") {
                 if (Tokenizer_READ(self, 2) == *">")
@@ -2030,7 +2221,6 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         else if (this == next && next == *"]" && this_context & LC_WIKILINK)
             return Tokenizer_handle_wikilink_end(self);
         else if (this == *"=" && !(self->global & GL_HEADING)) {
-            last = Tokenizer_READ_BACKWARDS(self, 1);
             if (last == *"\n" || last == *"") {
                 if (Tokenizer_parse_heading(self))
                     return NULL;
@@ -2077,6 +2267,29 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         }
         else if (this == *">" && this_context & LC_TAG_CLOSE)
             return Tokenizer_handle_tag_close_close(self);
+        else if (this == next && next == *"'") {
+            temp = Tokenizer_parse_style(self);
+            if (temp)
+                return temp;
+        }
+        else if (last == *"\n" || last == *"") {
+            if (this == *"#" || this == *"*" || this == *";" || this == *":") {
+                if (Tokenizer_handle_list(self))
+                    return NULL;
+            }
+            else if (this == *"-" && this == next &&
+                     this == Tokenizer_READ(self, 2) &&
+                     this == Tokenizer_READ(self, 3)) {
+                if (Tokenizer_handle_hr(self))
+                    return NULL;
+            }
+            else if (Tokenizer_emit_text(self, this))
+                return NULL;
+        }
+        else if ((this == *"\n" || this == *":") && this_context & LC_DLTERM) {
+            if (Tokenizer_handle_dl_term(self))
+                return NULL;
+        }
         else if (Tokenizer_emit_text(self, this))
             return NULL;
         self->head++;
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index c42f5f9..29e8fbe 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -41,20 +41,21 @@ SOFTWARE.
 #define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
 static const char* MARKERS[] = {
-    "{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-",
-    "\n", ""};
+    "{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", ":", "/",
+    "-", "\n", ""};
 
-#define NUM_MARKERS 17
+#define NUM_MARKERS 18
 #define TEXTBUFFER_BLOCKSIZE 1024
 #define MAX_DEPTH 40
 #define MAX_CYCLES 100000
 #define MAX_BRACES 255
 #define MAX_ENTITY_SIZE 8
 
-static int route_state = 0;
-#define BAD_ROUTE (route_state)
-#define FAIL_ROUTE() (route_state = 1)
-#define RESET_ROUTE() (route_state = 0)
+static int route_state = 0, route_context = 0;
+#define BAD_ROUTE route_state
+#define BAD_ROUTE_CONTEXT route_context
+#define FAIL_ROUTE(context) route_state = 1; route_context = context
+#define RESET_ROUTE() route_state = 0
 
 static char** entitydefs;
 
@@ -102,42 +103,50 @@ static PyObject* TagCloseClose;
 
 /* Local contexts: */
 
-#define LC_TEMPLATE             0x000007
-#define LC_TEMPLATE_NAME        0x000001
-#define LC_TEMPLATE_PARAM_KEY   0x000002
-#define LC_TEMPLATE_PARAM_VALUE 0x000004
-
-#define LC_ARGUMENT         0x000018
-#define LC_ARGUMENT_NAME    0x000008
-#define LC_ARGUMENT_DEFAULT 0x000010
-
-#define LC_WIKILINK       0x000060
-#define LC_WIKILINK_TITLE 0x000020
-#define LC_WIKILINK_TEXT  0x000040
-
-#define LC_HEADING         0x001F80
-#define LC_HEADING_LEVEL_1 0x000080
-#define LC_HEADING_LEVEL_2 0x000100
-#define LC_HEADING_LEVEL_3 0x000200
-#define LC_HEADING_LEVEL_4 0x000400
-#define LC_HEADING_LEVEL_5 0x000800
-#define LC_HEADING_LEVEL_6 0x001000
-
-#define LC_COMMENT 0x002000
-
-#define LC_TAG       0x03C000
-#define LC_TAG_OPEN  0x004000
-#define LC_TAG_ATTR  0x008000
-#define LC_TAG_BODY  0x010000
-#define LC_TAG_CLOSE 0x020000
-
-#define LC_SAFETY_CHECK   0xFC0000
-#define LC_HAS_TEXT       0x040000
-#define LC_FAIL_ON_TEXT   0x080000
-#define LC_FAIL_NEXT      0x100000
-#define LC_FAIL_ON_LBRACE 0x200000
-#define LC_FAIL_ON_RBRACE 0x400000
-#define LC_FAIL_ON_EQUALS 0x800000
+#define LC_TEMPLATE             0x00000007
+#define LC_TEMPLATE_NAME        0x00000001
+#define LC_TEMPLATE_PARAM_KEY   0x00000002
+#define LC_TEMPLATE_PARAM_VALUE 0x00000004
+
+#define LC_ARGUMENT         0x00000018
+#define LC_ARGUMENT_NAME    0x00000008
+#define LC_ARGUMENT_DEFAULT 0x00000010
+
+#define LC_WIKILINK       0x00000060
+#define LC_WIKILINK_TITLE 0x00000020
+#define LC_WIKILINK_TEXT  0x00000040
+
+#define LC_HEADING         0x00001F80
+#define LC_HEADING_LEVEL_1 0x00000080
+#define LC_HEADING_LEVEL_2 0x00000100
+#define LC_HEADING_LEVEL_3 0x00000200
+#define LC_HEADING_LEVEL_4 0x00000400
+#define LC_HEADING_LEVEL_5 0x00000800
+#define LC_HEADING_LEVEL_6 0x00001000
+
+#define LC_COMMENT 0x00002000
+
+#define LC_TAG       0x0003C000
+#define LC_TAG_OPEN  0x00004000
+#define LC_TAG_ATTR  0x00008000
+#define LC_TAG_BODY  0x00010000
+#define LC_TAG_CLOSE 0x00020000
+
+#define LC_STYLE             0x003C0000
+#define LC_STYLE_ITALICS     0x00040000
+#define LC_STYLE_BOLD        0x00080000
+#define LC_STYLE_PASS_AGAIN  0x00100000
+#define LC_STYLE_SECOND_PASS 0x00200000
+
+#define LC_DLTERM 0x00400000
+
+#define LC_SAFETY_CHECK   0x1F800000
+#define LC_HAS_TEXT       0x00800000
+#define LC_FAIL_ON_TEXT   0x01000000
+#define LC_FAIL_NEXT      0x02000000
+#define LC_FAIL_ON_LBRACE 0x04000000
+#define LC_FAIL_ON_RBRACE 0x08000000
+#define LC_FAIL_ON_EQUALS 0x10000000
 
 /* Global contexts: */
 
@@ -211,6 +220,7 @@ typedef struct {
 
 /* Macros for accessing HTML tag definitions: */
 
+#define GET_HTML_TAG(markup) (call_tag_def_func("get_html_tag", markup))
 #define IS_PARSABLE(tag)     (call_tag_def_func("is_parsable", tag))
 #define IS_SINGLE(tag)       (call_tag_def_func("is_single", tag))
 #define IS_SINGLE_ONLY(tag)  (call_tag_def_func("is_single_only", tag))
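
Notes on filling in the hooks above (sketches only, not part of the patch). First, Tokenizer_emit_style_tag(): the pseudocode maps onto the CPython API roughly as below. Tokenizer_emit() and Tokenizer_emit_all() are assumed to exist alongside the Tokenizer_emit_text() helper the parse loop already calls with a single Py_UNICODE; those names, and the local emit_token() helper, are assumptions rather than functions introduced by this patch.

/* Sketch-local helper: instantiate a token class (kwargs may be NULL) and
   emit it on the current stack. */
static int emit_token(Tokenizer* self, PyObject* cls, PyObject* kwargs)
{
    PyObject *args, *token;
    int result;

    args = PyTuple_New(0);
    if (!args)
        return -1;
    token = PyObject_Call(cls, args, kwargs);
    Py_DECREF(args);
    if (!token)
        return -1;
    result = Tokenizer_emit(self, token);       /* assumed helper */
    Py_DECREF(token);
    return result;
}

/* Sketch of the prototype above: wrap an already-tokenized body in
   wiki-markup <i>/<b> tag tokens. */
static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks,
                                    PyObject* body)
{
    PyObject *markup, *kwargs;

    markup = PyUnicode_FromString(ticks == 3 ? "'''" : "''");
    kwargs = markup ? PyDict_New() : NULL;
    if (!kwargs || PyDict_SetItemString(kwargs, "wiki_markup", markup) < 0) {
        Py_XDECREF(markup);
        Py_XDECREF(kwargs);
        return -1;
    }
    Py_DECREF(markup);
    if (emit_token(self, TagOpenOpen, kwargs)) {    /* TagOpenOpen(wiki_markup=...) */
        Py_DECREF(kwargs);
        return -1;
    }
    Py_DECREF(kwargs);
    if (Tokenizer_emit_text(self, tag))             /* tag is 'i' or 'b' */
        return -1;
    if (emit_token(self, TagCloseOpen, NULL))
        return -1;
    if (Tokenizer_emit_all(self, body))             /* assumed helper */
        return -1;
    if (emit_token(self, TagOpenClose, NULL))
        return -1;
    if (Tokenizer_emit_text(self, tag))
        return -1;
    return emit_token(self, TagCloseClose, NULL);
}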
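
Second, the BadRoute handling in Tokenizer_parse_italics() is where the new FAIL_ROUTE(context)/BAD_ROUTE_CONTEXT machinery from tokenizer.h comes into play. A sketch, assuming self->head is the tokenizer's Py_ssize_t read cursor as used elsewhere in the file:

static int Tokenizer_parse_italics(Tokenizer* self)
{
    Py_ssize_t reset = self->head;
    int context;
    PyObject* stack;

    stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
    if (BAD_ROUTE) {
        context = BAD_ROUTE_CONTEXT;
        RESET_ROUTE();
        self->head = reset;
        if (!(context & LC_STYLE_PASS_AGAIN)) {
            /* Give up: the two apostrophes are plain text. */
            if (Tokenizer_emit_text(self, *"'") ||
                Tokenizer_emit_text(self, *"'"))
                return -1;
            return 0;
        }
        /* Retry the route, marking it as the second pass. */
        stack = Tokenizer_parse(self, context | LC_STYLE_SECOND_PASS, 1);
    }
    if (!stack)
        return -1;
    return Tokenizer_emit_style_tag(self, 'i', 2, stack);
}

The retry mirrors the pseudocode's self._parse(route.context | contexts.STYLE_SECOND_PASS); whether a second BadRoute needs explicit handling there is left open, as it is in the pseudocode.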
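
Third, the apostrophe-counting prologue of Tokenizer_parse_style() could be pulled into a small helper. The name Tokenizer_count_ticks() is hypothetical; the normalization of runs longer than five ticks, or of exactly four, follows the pseudocode:

/* Hypothetical helper: count the run of apostrophes at the head and
   normalize it, emitting any excess as literal text.  Returns the
   normalized tick count (2, 3, or 5), or -1 on error. */
static int Tokenizer_count_ticks(Tokenizer* self)
{
    int ticks = 2, i;

    self->head += 2;
    while (Tokenizer_READ(self, 0) == *"'") {
        self->head++;
        ticks++;
    }
    if (ticks > 5) {
        /* Anything beyond five apostrophes is plain text. */
        for (i = 0; i < ticks - 5; i++) {
            if (Tokenizer_emit_text(self, *"'"))
                return -1;
        }
        return 5;
    }
    if (ticks == 4) {
        /* Four ticks: one literal apostrophe, then bold ('''). */
        if (Tokenizer_emit_text(self, *"'"))
            return -1;
        return 3;
    }
    return ticks;
}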
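
Finally, Tokenizer_handle_dl_term() is the smallest hook: with LC_DLTERM defined in tokenizer.h it reduces to a context toggle plus a dispatch. A sketch, relying on Tokenizer_handle_list_marker() once that prototype is filled in:

static int Tokenizer_handle_dl_term(Tokenizer* self)
{
    self->topstack->context ^= LC_DLTERM;
    if (Tokenizer_READ(self, 0) == *":")
        return Tokenizer_handle_list_marker(self);
    return Tokenizer_emit_text(self, *"\n");
}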