From 9b98907751c28c48e0a2ff97583c26f371948128 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Tue, 13 Aug 2013 20:55:35 -0400
Subject: [PATCH] Add C hooks and prototypes for wiki-markup tags.

---
 mwparserfromhell/parser/tokenizer.c | 227 ++++++++++++++++++++++++++++++++++--
 mwparserfromhell/parser/tokenizer.h |  96 ++++++++-------
 2 files changed, 273 insertions(+), 50 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index bae5ec2..be996ad 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -325,9 +325,10 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
 */
 static void* Tokenizer_fail_route(Tokenizer* self)
 {
+    int context = self->topstack->context;
     PyObject* stack = Tokenizer_pop(self);
     Py_XDECREF(stack);
-    FAIL_ROUTE();
+    FAIL_ROUTE(context);
     return NULL;
 }
 
@@ -1776,7 +1777,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
                 return -1;
             }
             if (!IS_SINGLE_ONLY(name))
-                FAIL_ROUTE();
+                FAIL_ROUTE(0);
             break;
         }
         Textbuffer_write(&buf, this);
@@ -1823,12 +1824,201 @@ static int Tokenizer_parse_tag(Tokenizer* self)
 }
 
 /*
+    Write the body of a tag and the tokens that should surround it.
+*/
+static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks,
+                                    PyObject* body)
+{
+    // self._emit(tokens.TagOpenOpen(wiki_markup=markup))
+    // self._emit_text(tag)
+    // self._emit(tokens.TagCloseOpen())
+    // self._emit_all(body)
+    // self._emit(tokens.TagOpenClose())
+    // self._emit_text(tag)
+    // self._emit(tokens.TagCloseClose())
+}
+
+/*
+    Parse wiki-style italics.
+*/
+static int Tokenizer_parse_italics(Tokenizer* self)
+{
+    // reset = self._head
+    // try:
+    //     stack = self._parse(contexts.STYLE_ITALICS)
+    // except BadRoute as route:
+    //     self._head = reset
+    //     if route.context & contexts.STYLE_PASS_AGAIN:
+    //         stack = self._parse(route.context | contexts.STYLE_SECOND_PASS)
+    //     else:
+    //         return self._emit_text("''")
+    // self._emit_style_tag("i", "''", stack)
+}
+
+/*
+    Parse wiki-style bold.
+*/
+static int Tokenizer_parse_bold(Tokenizer* self)
+{
+    // reset = self._head
+    // try:
+    //     stack = self._parse(contexts.STYLE_BOLD)
+    // except BadRoute:
+    //     self._head = reset
+    //     if self._context & contexts.STYLE_SECOND_PASS:
+    //         self._emit_text("'")
+    //         return True ## we can return 1 for this and -1 for errors (switch case)
+    //     elif self._context & contexts.STYLE_ITALICS:
+    //         self._context |= contexts.STYLE_PASS_AGAIN
+    //         self._emit_text("'''")
+    //     else:
+    //         self._emit_text("'")
+    //         self._parse_italics()
+    // else:
+    //     self._emit_style_tag("b", "'''", stack)
+}
+
+/*
+    Parse wiki-style italics and bold together (i.e., five ticks).
+*/
+static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
+{
+    // reset = self._head
+    // try:
+    //     stack = self._parse(contexts.STYLE_BOLD)
+    // except BadRoute:
+    //     self._head = reset
+    //     try:
+    //         stack = self._parse(contexts.STYLE_ITALICS)
+    //     except BadRoute:
+    //         self._head = reset
+    //         self._emit_text("'''''")
+    //     else:
+    //         reset = self._head
+    //         try:
+    //             stack2 = self._parse(contexts.STYLE_BOLD)
+    //         except BadRoute:
+    //             self._head = reset
+    //             self._emit_text("'''")
+    //             self._emit_style_tag("i", "''", stack)
+    //         else:
+    //             self._push()
+    //             self._emit_style_tag("i", "''", stack)
+    //             self._emit_all(stack2)
+    //             self._emit_style_tag("b", "'''", self._pop())
+    // else:
+    //     reset = self._head
+    //     try:
+    //         stack2 = self._parse(contexts.STYLE_ITALICS)
+    //     except BadRoute:
+    //         self._head = reset
+    //         self._emit_text("''")
+    //         self._emit_style_tag("b", "'''", stack)
+    //     else:
+    //         self._push()
+    //         self._emit_style_tag("b", "'''", stack)
+    //         self._emit_all(stack2)
+    //         self._emit_style_tag("i", "''", self._pop())
+}
+
+/*
+    Parse wiki-style formatting (''/''' for italics/bold).
+*/
+static PyObject* Tokenizer_parse_style(Tokenizer* self)
+{
+    // self._head += 2
+    // ticks = 2
+    // while self._read() == "'":
+    //     self._head += 1
+    //     ticks += 1
+    // italics = self._context & contexts.STYLE_ITALICS
+    // bold = self._context & contexts.STYLE_BOLD
+    // if ticks > 5:
+    //     self._emit_text("'" * (ticks - 5))
+    //     ticks = 5
+    // elif ticks == 4:
+    //     self._emit_text("'")
+    //     ticks = 3
+    // if (italics and ticks in (2, 5)) or (bold and ticks in (3, 5)):
+    //     if ticks == 5:
+    //         self._head -= 3 if italics else 2
+    //     return self._pop()
+    // elif not self._can_recurse():
+    //     if ticks == 3:
+    //         if self._context & contexts.STYLE_SECOND_PASS:
+    //             self._emit_text("'")
+    //             return self._pop()
+    //         self._context |= contexts.STYLE_PASS_AGAIN
+    //     self._emit_text("'" * ticks)
+    // elif ticks == 2:
+    //     self._parse_italics()
+    // elif ticks == 3:
+    //     if self._parse_bold():
+    //         return self._pop()
+    // elif ticks == 5:
+    //     self._parse_italics_and_bold()
+    // self._head -= 1
+    // ## we can return Py_None for non-error empty returns
+}
+
+/*
+    Handle a list marker at the head (#, *, ;, :).
+*/
+static int Tokenizer_handle_list_marker(Tokenizer* self)
+{
+    // markup = self._read()
+    // if markup == ";":
+    //     self._context |= contexts.DL_TERM
+    // self._emit(tokens.TagOpenOpen(wiki_markup=markup))
+    // self._emit_text(get_html_tag(markup))
+    // self._emit(tokens.TagCloseSelfclose())
+}
+
+/*
+    Handle a wiki-style list (#, *, ;, :).
+*/
+static int Tokenizer_handle_list(Tokenizer* self)
+{
+    // self._handle_list_marker()
+    // while self._read(1) in ("#", "*", ";", ":"):
+    //     self._head += 1
+    //     self._handle_list_marker()
+}
+
+/*
+    Handle a wiki-style horizontal rule (----) in the string.
+*/
+static int Tokenizer_handle_hr(Tokenizer* self)
+{
+    // length = 4
+    // self._head += 3
+    // while self._read(1) == "-":
+    //     length += 1
+    //     self._head += 1
+    // self._emit(tokens.TagOpenOpen(wiki_markup="-" * length))
+    // self._emit_text("hr")
+    // self._emit(tokens.TagCloseSelfclose())
+}
+
+/*
+    Handle the term in a description list ('foo' in ';foo:bar').
+*/
+static int Tokenizer_handle_dl_term(Tokenizer* self)
+{
+    // self._context ^= contexts.DL_TERM
+    // if self._read() == ":":
+    //     self._handle_list_marker()
+    // else:
+    //     self._emit_text("\n")
+}
+
+/*
     Handle the end of the stream of wikitext.
 */
 static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
 {
     static int fail_contexts = (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK |
-                                LC_HEADING | LC_COMMENT | LC_TAG);
+                                LC_HEADING | LC_COMMENT | LC_TAG | LC_STYLE);
     static int double_fail = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
     PyObject *token, *text, *trash;
     int single;
@@ -1943,7 +2133,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
     static int double_unsafe = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
     int this_context, is_marker, i;
     Py_UNICODE this, next, next_next, last;
-    PyObject* trash;
+    PyObject* temp;
 
     if (push) {
         if (Tokenizer_push(self, context))
@@ -1955,8 +2145,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         if (this_context & unsafe_contexts) {
             if (Tokenizer_verify_safe(self, this_context, this) < 0) {
                 if (this_context & double_unsafe) {
-                    trash = Tokenizer_pop(self);
-                    Py_XDECREF(trash);
+                    temp = Tokenizer_pop(self);
+                    Py_XDECREF(temp);
                 }
                 return Tokenizer_fail_route(self);
             }
@@ -1977,6 +2167,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         if (this == *"")
             return Tokenizer_handle_end(self, this_context);
         next = Tokenizer_READ(self, 1);
+        last = Tokenizer_READ_BACKWARDS(self, 1);
         if (this_context & LC_COMMENT) {
             if (this == next && next == *"-") {
                 if (Tokenizer_READ(self, 2) == *">")
@@ -2030,7 +2221,6 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         else if (this == next && next == *"]" && this_context & LC_WIKILINK)
             return Tokenizer_handle_wikilink_end(self);
         else if (this == *"=" && !(self->global & GL_HEADING)) {
-            last = Tokenizer_READ_BACKWARDS(self, 1);
             if (last == *"\n" || last == *"") {
                 if (Tokenizer_parse_heading(self))
                     return NULL;
@@ -2077,6 +2267,29 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         }
         else if (this == *">" && this_context & LC_TAG_CLOSE)
             return Tokenizer_handle_tag_close_close(self);
+        else if (this == next && next == *"'") {
+            temp = Tokenizer_parse_style(self);
+            if (temp)
+                return temp;
+        }
+        else if (last == *"\n" || last == *"") {
+            if (this == *"#" || this == *"*" || this == *";" || this == *":") {
+                if (Tokenizer_handle_list(self))
+                    return NULL;
+            }
+            else if (this == *"-" && this == next &&
+                     this == Tokenizer_READ(self, 2) &&
+                     this == Tokenizer_READ(self, 3)) {
+                if (Tokenizer_handle_hr(self))
+                    return NULL;
+            }
+            else if (Tokenizer_emit_text(self, this))
+                return NULL;
+        }
+        else if ((this == *"\n" || this == *":") && this_context & LC_DLTERM) {
+            if (Tokenizer_handle_dl_term(self))
+                return NULL;
+        }
         else if (Tokenizer_emit_text(self, this))
             return NULL;
         self->head++;
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index c42f5f9..29e8fbe 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -41,20 +41,21 @@ SOFTWARE.
 #define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
 static const char* MARKERS[] = {
-    "{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-",
-    "\n", ""};
+    "{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", ":", "/",
+    "-", "\n", ""};
 
-#define NUM_MARKERS 17
+#define NUM_MARKERS 18
 #define TEXTBUFFER_BLOCKSIZE 1024
 #define MAX_DEPTH 40
 #define MAX_CYCLES 100000
 #define MAX_BRACES 255
 #define MAX_ENTITY_SIZE 8
 
-static int route_state = 0;
-#define BAD_ROUTE (route_state)
-#define FAIL_ROUTE() (route_state = 1)
-#define RESET_ROUTE() (route_state = 0)
+static int route_state = 0, route_context = 0;
+#define BAD_ROUTE route_state
+#define BAD_ROUTE_CONTEXT route_context
+#define FAIL_ROUTE(context) route_state = 1; route_context = context
+#define RESET_ROUTE() route_state = 0
 
 static char** entitydefs;
 
@@ -102,42 +103,50 @@ static PyObject* TagCloseClose;
 
 /* Local contexts: */
 
-#define LC_TEMPLATE             0x000007
-#define LC_TEMPLATE_NAME        0x000001
-#define LC_TEMPLATE_PARAM_KEY   0x000002
-#define LC_TEMPLATE_PARAM_VALUE 0x000004
-
-#define LC_ARGUMENT         0x000018
-#define LC_ARGUMENT_NAME    0x000008
-#define LC_ARGUMENT_DEFAULT 0x000010
-
-#define LC_WIKILINK       0x000060
-#define LC_WIKILINK_TITLE 0x000020
-#define LC_WIKILINK_TEXT  0x000040
-
-#define LC_HEADING         0x001F80
-#define LC_HEADING_LEVEL_1 0x000080
-#define LC_HEADING_LEVEL_2 0x000100
-#define LC_HEADING_LEVEL_3 0x000200
-#define LC_HEADING_LEVEL_4 0x000400
-#define LC_HEADING_LEVEL_5 0x000800
-#define LC_HEADING_LEVEL_6 0x001000
-
-#define LC_COMMENT 0x002000
-
-#define LC_TAG       0x03C000
-#define LC_TAG_OPEN  0x004000
-#define LC_TAG_ATTR  0x008000
-#define LC_TAG_BODY  0x010000
-#define LC_TAG_CLOSE 0x020000
-
-#define LC_SAFETY_CHECK   0xFC0000
-#define LC_HAS_TEXT       0x040000
-#define LC_FAIL_ON_TEXT   0x080000
-#define LC_FAIL_NEXT      0x100000
-#define LC_FAIL_ON_LBRACE 0x200000
-#define LC_FAIL_ON_RBRACE 0x400000
-#define LC_FAIL_ON_EQUALS 0x800000
+#define LC_TEMPLATE             0x00000007
+#define LC_TEMPLATE_NAME        0x00000001
+#define LC_TEMPLATE_PARAM_KEY   0x00000002
+#define LC_TEMPLATE_PARAM_VALUE 0x00000004
+
+#define LC_ARGUMENT         0x00000018
+#define LC_ARGUMENT_NAME    0x00000008
+#define LC_ARGUMENT_DEFAULT 0x00000010
+
+#define LC_WIKILINK       0x00000060
+#define LC_WIKILINK_TITLE 0x00000020
+#define LC_WIKILINK_TEXT  0x00000040
+
+#define LC_HEADING         0x00001F80
+#define LC_HEADING_LEVEL_1 0x00000080
+#define LC_HEADING_LEVEL_2 0x00000100
+#define LC_HEADING_LEVEL_3 0x00000200
+#define LC_HEADING_LEVEL_4 0x00000400
+#define LC_HEADING_LEVEL_5 0x00000800
+#define LC_HEADING_LEVEL_6 0x00001000
+
+#define LC_COMMENT 0x00002000
+
+#define LC_TAG       0x0003C000
+#define LC_TAG_OPEN  0x00004000
+#define LC_TAG_ATTR  0x00008000
+#define LC_TAG_BODY  0x00010000
+#define LC_TAG_CLOSE 0x00020000
+
+#define LC_STYLE             0x003C0000
+#define LC_STYLE_ITALICS     0x00040000
+#define LC_STYLE_BOLD        0x00080000
+#define LC_STYLE_PASS_AGAIN  0x00100000
+#define LC_STYLE_SECOND_PASS 0x00200000
+
+#define LC_DLTERM 0x00400000
+
+#define LC_SAFETY_CHECK   0x1F800000
+#define LC_HAS_TEXT       0x00800000
+#define LC_FAIL_ON_TEXT   0x01000000
+#define LC_FAIL_NEXT      0x02000000
+#define LC_FAIL_ON_LBRACE 0x04000000
+#define LC_FAIL_ON_RBRACE 0x08000000
+#define LC_FAIL_ON_EQUALS 0x10000000
 
 /* Global contexts: */
 
@@ -211,6 +220,7 @@ typedef struct {
 
 /* Macros for accessing HTML tag definitions: */
 
+#define GET_HTML_TAG(markup) (call_tag_def_func("get_html_tag", markup))
 #define IS_PARSABLE(tag)     (call_tag_def_func("is_parsable", tag))
 #define IS_SINGLE(tag)       (call_tag_def_func("is_single", tag))
 #define IS_SINGLE_ONLY(tag)  (call_tag_def_func("is_single_only", tag))
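
Notes on filling in the hooks above (sketches only, not part of the patch). First, Tokenizer_emit_style_tag(): the pseudocode maps onto the CPython API roughly as below. Tokenizer_emit() and Tokenizer_emit_all() are assumed to exist alongside the Tokenizer_emit_text() helper the parse loop already calls with a single Py_UNICODE; those names, and the local emit_token() helper, are assumptions rather than functions introduced by this patch.

/* Sketch-local helper: instantiate a token class (kwargs may be NULL) and
   emit it on the current stack. */
static int emit_token(Tokenizer* self, PyObject* cls, PyObject* kwargs)
{
    PyObject *args, *token;
    int result;

    args = PyTuple_New(0);
    if (!args)
        return -1;
    token = PyObject_Call(cls, args, kwargs);
    Py_DECREF(args);
    if (!token)
        return -1;
    result = Tokenizer_emit(self, token);       /* assumed helper */
    Py_DECREF(token);
    return result;
}

/* Sketch of the prototype above: wrap an already-tokenized body in
   wiki-markup <i>/<b> tag tokens. */
static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks,
                                    PyObject* body)
{
    PyObject *markup, *kwargs;

    markup = PyUnicode_FromString(ticks == 3 ? "'''" : "''");
    kwargs = markup ? PyDict_New() : NULL;
    if (!kwargs || PyDict_SetItemString(kwargs, "wiki_markup", markup) < 0) {
        Py_XDECREF(markup);
        Py_XDECREF(kwargs);
        return -1;
    }
    Py_DECREF(markup);
    if (emit_token(self, TagOpenOpen, kwargs)) {    /* TagOpenOpen(wiki_markup=...) */
        Py_DECREF(kwargs);
        return -1;
    }
    Py_DECREF(kwargs);
    if (Tokenizer_emit_text(self, tag))             /* tag is 'i' or 'b' */
        return -1;
    if (emit_token(self, TagCloseOpen, NULL))
        return -1;
    if (Tokenizer_emit_all(self, body))             /* assumed helper */
        return -1;
    if (emit_token(self, TagOpenClose, NULL))
        return -1;
    if (Tokenizer_emit_text(self, tag))
        return -1;
    return emit_token(self, TagCloseClose, NULL);
}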
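
Second, the BadRoute handling in Tokenizer_parse_italics() is where the new FAIL_ROUTE(context)/BAD_ROUTE_CONTEXT machinery from tokenizer.h comes into play. A sketch, assuming self->head is the tokenizer's Py_ssize_t read cursor as used elsewhere in the file:

static int Tokenizer_parse_italics(Tokenizer* self)
{
    Py_ssize_t reset = self->head;
    int context;
    PyObject* stack;

    stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
    if (BAD_ROUTE) {
        context = BAD_ROUTE_CONTEXT;
        RESET_ROUTE();
        self->head = reset;
        if (!(context & LC_STYLE_PASS_AGAIN)) {
            /* Give up: the two apostrophes are plain text. */
            if (Tokenizer_emit_text(self, *"'") ||
                Tokenizer_emit_text(self, *"'"))
                return -1;
            return 0;
        }
        /* Retry the route, marking it as the second pass. */
        stack = Tokenizer_parse(self, context | LC_STYLE_SECOND_PASS, 1);
    }
    if (!stack)
        return -1;
    return Tokenizer_emit_style_tag(self, 'i', 2, stack);
}

The retry mirrors the pseudocode's self._parse(route.context | contexts.STYLE_SECOND_PASS); whether a second BadRoute needs explicit handling there is left open, as it is in the pseudocode.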
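
Third, the apostrophe-counting prologue of Tokenizer_parse_style() could be pulled into a small helper. The name Tokenizer_count_ticks() is hypothetical; the normalization of runs longer than five ticks, or of exactly four, follows the pseudocode:

/* Hypothetical helper: count the run of apostrophes at the head and
   normalize it, emitting any excess as literal text.  Returns the
   normalized tick count (2, 3, or 5), or -1 on error. */
static int Tokenizer_count_ticks(Tokenizer* self)
{
    int ticks = 2, i;

    self->head += 2;
    while (Tokenizer_READ(self, 0) == *"'") {
        self->head++;
        ticks++;
    }
    if (ticks > 5) {
        /* Anything beyond five apostrophes is plain text. */
        for (i = 0; i < ticks - 5; i++) {
            if (Tokenizer_emit_text(self, *"'"))
                return -1;
        }
        return 5;
    }
    if (ticks == 4) {
        /* Four ticks: one literal apostrophe, then bold ('''). */
        if (Tokenizer_emit_text(self, *"'"))
            return -1;
        return 3;
    }
    return ticks;
}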
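
Finally, Tokenizer_handle_dl_term() is the smallest hook: with LC_DLTERM defined in tokenizer.h it reduces to a context toggle plus a dispatch. A sketch, relying on Tokenizer_handle_list_marker() once that prototype is filled in:

static int Tokenizer_handle_dl_term(Tokenizer* self)
{
    self->topstack->context ^= LC_DLTERM;
    if (Tokenizer_READ(self, 0) == *":")
        return Tokenizer_handle_list_marker(self);
    return Tokenizer_emit_text(self, *"\n");
}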