Browse Source

Add C hooks and prototypes for wiki-markup tags.

tags/v0.3
Ben Kurtovic 10 years ago
parent
commit
9b98907751
2 changed files with 273 additions and 50 deletions
  1. +220
    -7
      mwparserfromhell/parser/tokenizer.c
  2. +53
    -43
      mwparserfromhell/parser/tokenizer.h

+ 220
- 7
mwparserfromhell/parser/tokenizer.c View File

@@ -325,9 +325,10 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
*/
static void* Tokenizer_fail_route(Tokenizer* self)
{
/* Read the context of the current top stack before it is popped; the value
   is handed to FAIL_ROUTE(context) below. */
int context = self->topstack->context;
PyObject* stack = Tokenizer_pop(self);
/* Py_XDECREF (not Py_DECREF) so a NULL result from the pop is tolerated. */
Py_XDECREF(stack);
/* NOTE(review): both the zero-argument and the one-argument FAIL_ROUTE calls
   appear here back to back. This looks like the removed line and its
   replacement shown together by the diff view -- confirm that only the
   FAIL_ROUTE(context) form survives in the actual file. */
FAIL_ROUTE();
FAIL_ROUTE(context);
/* Always NULL; callers in this file write `return Tokenizer_fail_route(self);`. */
return NULL;
}

@@ -1776,7 +1777,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
return -1;
}
if (!IS_SINGLE_ONLY(name))
FAIL_ROUTE();
FAIL_ROUTE(0);
break;
}
Textbuffer_write(&buf, this);
@@ -1823,12 +1824,201 @@ static int Tokenizer_parse_tag(Tokenizer* self)
}

/*
    Write the body of a tag and the tokens that should surround it.

    Not ported yet: the commented Python below is the reference logic. Until
    it is translated, the function emits nothing and returns 0 so that it has
    a defined result instead of falling off the end of a non-void function.

    NOTE(review): the reference receives the markup string itself ("''" or
    "'''"), whereas this prototype receives `tag` as a single char and a
    `ticks` count -- presumably the markup is `ticks` apostrophes; confirm
    when porting.
*/
static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks,
                                    PyObject* body)
{
    // self._emit(tokens.TagOpenOpen(wiki_markup=markup))
    // self._emit_text(tag)
    // self._emit(tokens.TagCloseOpen())
    // self._emit_all(body)
    // self._emit(tokens.TagOpenClose())
    // self._emit_text(tag)
    // self._emit(tokens.TagCloseClose())

    /* TODO: placeholder result until the reference logic above is ported. */
    return 0;
}

/*
    Parse wiki-style italics.

    Not ported yet: the commented Python below is the reference logic (its
    block indentation has been restored for readability). Until it is
    translated, the function does nothing and returns 0 so that it has a
    defined result instead of falling off the end of a non-void function.
*/
static int Tokenizer_parse_italics(Tokenizer* self)
{
    // reset = self._head
    // try:
    //     stack = self._parse(contexts.STYLE_ITALICS)
    // except BadRoute as route:
    //     self._head = reset
    //     if route.context & contexts.STYLE_PASS_AGAIN:
    //         stack = self._parse(route.context | contexts.STYLE_SECOND_PASS)
    //     else:
    //         return self._emit_text("''")
    // self._emit_style_tag("i", "''", stack)

    /* TODO: placeholder result until the reference logic above is ported. */
    return 0;
}

/*
    Parse wiki-style bold.

    Not ported yet: the commented Python below is the reference logic (its
    block indentation has been restored for readability). The inline note in
    the reference proposes returning 1 where Python returns True and -1 for
    errors. Until the logic is translated, the function does nothing and
    returns 0 so that it has a defined result instead of falling off the end
    of a non-void function.
*/
static int Tokenizer_parse_bold(Tokenizer* self)
{
    // reset = self._head
    // try:
    //     stack = self._parse(contexts.STYLE_BOLD)
    // except BadRoute:
    //     self._head = reset
    //     if self._context & contexts.STYLE_SECOND_PASS:
    //         self._emit_text("'")
    //         return True ## we can return 1 for this and -1 for errors (switch case)
    //     elif self._context & contexts.STYLE_ITALICS:
    //         self._context |= contexts.STYLE_PASS_AGAIN
    //         self._emit_text("'''")
    //     else:
    //         self._emit_text("'")
    //         self._parse_italics()
    // else:
    //     self._emit_style_tag("b", "'''", stack)

    /* TODO: placeholder result until the reference logic above is ported. */
    return 0;
}

/*
    Parse wiki-style italics and bold together (i.e., five ticks).

    Not ported yet: the commented Python below is the reference logic (its
    block indentation has been restored for readability). Until it is
    translated, the function does nothing and returns 0 so that it has a
    defined result instead of falling off the end of a non-void function.
*/
static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
{
    // reset = self._head
    // try:
    //     stack = self._parse(contexts.STYLE_BOLD)
    // except BadRoute:
    //     self._head = reset
    //     try:
    //         stack = self._parse(contexts.STYLE_ITALICS)
    //     except BadRoute:
    //         self._head = reset
    //         self._emit_text("'''''")
    //     else:
    //         reset = self._head
    //         try:
    //             stack2 = self._parse(contexts.STYLE_BOLD)
    //         except BadRoute:
    //             self._head = reset
    //             self._emit_text("'''")
    //             self._emit_style_tag("i", "''", stack)
    //         else:
    //             self._push()
    //             self._emit_style_tag("i", "''", stack)
    //             self._emit_all(stack2)
    //             self._emit_style_tag("b", "'''", self._pop())
    // else:
    //     reset = self._head
    //     try:
    //         stack2 = self._parse(contexts.STYLE_ITALICS)
    //     except BadRoute:
    //         self._head = reset
    //         self._emit_text("''")
    //         self._emit_style_tag("b", "'''", stack)
    //     else:
    //         self._push()
    //         self._emit_style_tag("b", "'''", stack)
    //         self._emit_all(stack2)
    //         self._emit_style_tag("i", "''", self._pop())

    /* TODO: placeholder result until the reference logic above is ported. */
    return 0;
}

/*
    Parse wiki-style formatting (''/''' for italics/bold).

    Not ported yet: the commented Python below is the reference logic (its
    block indentation has been restored for readability).

    The call site in Tokenizer_parse stores the result and returns it when it
    is non-NULL; otherwise it carries on scanning. The stub therefore returns
    NULL explicitly, giving that test a defined value to inspect instead of
    the indeterminate result of falling off the end of a non-void function.

    NOTE(review): the trailing note in the reference proposes Py_None for
    "no result, no error". That would require the caller to tell Py_None
    apart from a popped stack -- reconcile the two conventions when porting.
*/
static PyObject* Tokenizer_parse_style(Tokenizer* self)
{
    // self._head += 2
    // ticks = 2
    // while self._read() == "'":
    //     self._head += 1
    //     ticks += 1
    // italics = self._context & contexts.STYLE_ITALICS
    // bold = self._context & contexts.STYLE_BOLD
    // if ticks > 5:
    //     self._emit_text("'" * (ticks - 5))
    //     ticks = 5
    // elif ticks == 4:
    //     self._emit_text("'")
    //     ticks = 3
    // if (italics and ticks in (2, 5)) or (bold and ticks in (3, 5)):
    //     if ticks == 5:
    //         self._head -= 3 if italics else 2
    //     return self._pop()
    // elif not self._can_recurse():
    //     if ticks == 3:
    //         if self._context & contexts.STYLE_SECOND_PASS:
    //             self._emit_text("'")
    //             return self._pop()
    //         self._context |= contexts.STYLE_PASS_AGAIN
    //     self._emit_text("'" * ticks)
    // elif ticks == 2:
    //     self._parse_italics()
    // elif ticks == 3:
    //     if self._parse_bold():
    //         return self._pop()
    // elif ticks == 5:
    //     self._parse_italics_and_bold()
    // self._head -= 1
    // ## we can return Py_None for non-error empty returns

    /* TODO: placeholder result until the reference logic above is ported. */
    return NULL;
}

/*
    Handle a list marker at the head (#, *, ;, :).

    Not ported yet: the commented Python below is the reference logic (its
    block indentation has been restored for readability). Until it is
    translated, the function emits nothing and returns 0 so that it has a
    defined result instead of falling off the end of a non-void function.
*/
static int Tokenizer_handle_list_marker(Tokenizer* self)
{
    // markup = self._read()
    // if markup == ";":
    //     self._context |= contexts.DL_TERM
    // self._emit(tokens.TagOpenOpen(wiki_markup=markup))
    // self._emit_text(get_html_tag(markup))
    // self._emit(tokens.TagCloseSelfclose())

    /* TODO: placeholder result until the reference logic above is ported. */
    return 0;
}

/*
    Handle a wiki-style list (#, *, ;, :).

    Not ported yet: the commented Python below is the reference logic (its
    block indentation has been restored for readability).

    Tokenizer_parse tests the result (`if (Tokenizer_handle_list(self))
    return NULL;`), so the stub returns 0 explicitly; without a return
    statement that test would read an indeterminate value.
*/
static int Tokenizer_handle_list(Tokenizer* self)
{
    // self._handle_list_marker()
    // while self._read(1) in ("#", "*", ";", ":"):
    //     self._head += 1
    //     self._handle_list_marker()

    /* TODO: placeholder result until the reference logic above is ported. */
    return 0;
}

/*
    Handle a wiki-style horizontal rule (----) in the string.

    Not ported yet: the commented Python below is the reference logic (its
    block indentation has been restored for readability).

    Tokenizer_parse tests the result (`if (Tokenizer_handle_hr(self))
    return NULL;`), so the stub returns 0 explicitly; without a return
    statement that test would read an indeterminate value.
*/
static int Tokenizer_handle_hr(Tokenizer* self)
{
    // length = 4
    // self._head += 3
    // while self._read(1) == "-":
    //     length += 1
    //     self._head += 1
    // self._emit(tokens.TagOpenOpen(wiki_markup="-" * length))
    // self._emit_text("hr")
    // self._emit(tokens.TagCloseSelfclose())

    /* TODO: placeholder result until the reference logic above is ported. */
    return 0;
}

/*
    Handle the term in a description list ('foo' in ';foo:bar').

    Not ported yet: the commented Python below is the reference logic (its
    block indentation has been restored for readability).

    Tokenizer_parse tests the result (`if (Tokenizer_handle_dl_term(self))
    return NULL;`), so the stub returns 0 explicitly; without a return
    statement that test would read an indeterminate value.
*/
static int Tokenizer_handle_dl_term(Tokenizer* self)
{
    // self._context ^= contexts.DL_TERM
    // if self._read() == ":":
    //     self._handle_list_marker()
    // else:
    //     self._emit_text("\n")

    /* TODO: placeholder result until the reference logic above is ported. */
    return 0;
}

/*
Handle the end of the stream of wikitext.
*/
static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
{
static int fail_contexts = (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK |
LC_HEADING | LC_COMMENT | LC_TAG);
LC_HEADING | LC_COMMENT | LC_TAG | LC_STYLE);
static int double_fail = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
PyObject *token, *text, *trash;
int single;
@@ -1943,7 +2133,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
static int double_unsafe = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
int this_context, is_marker, i;
Py_UNICODE this, next, next_next, last;
PyObject* trash;
PyObject* temp;

if (push) {
if (Tokenizer_push(self, context))
@@ -1955,8 +2145,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (this_context & unsafe_contexts) {
if (Tokenizer_verify_safe(self, this_context, this) < 0) {
if (this_context & double_unsafe) {
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
temp = Tokenizer_pop(self);
Py_XDECREF(temp);
}
return Tokenizer_fail_route(self);
}
@@ -1977,6 +2167,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (this == *"")
return Tokenizer_handle_end(self, this_context);
next = Tokenizer_READ(self, 1);
last = Tokenizer_READ_BACKWARDS(self, 1);
if (this_context & LC_COMMENT) {
if (this == next && next == *"-") {
if (Tokenizer_READ(self, 2) == *">")
@@ -2030,7 +2221,6 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
else if (this == next && next == *"]" && this_context & LC_WIKILINK)
return Tokenizer_handle_wikilink_end(self);
else if (this == *"=" && !(self->global & GL_HEADING)) {
last = Tokenizer_READ_BACKWARDS(self, 1);
if (last == *"\n" || last == *"") {
if (Tokenizer_parse_heading(self))
return NULL;
@@ -2077,6 +2267,29 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
}
else if (this == *">" && this_context & LC_TAG_CLOSE)
return Tokenizer_handle_tag_close_close(self);
else if (this == next && next == *"'") {
temp = Tokenizer_parse_style(self);
if (temp)
return temp;
}
else if (last == *"\n" || last == *"") {
if (this == *"#" || this == *"*" || this == *";" || this == *":") {
if (Tokenizer_handle_list(self))
return NULL;
}
else if (this == *"-" && this == next &&
this == Tokenizer_READ(self, 2) &&
this == Tokenizer_READ(self, 3)) {
if (Tokenizer_handle_hr(self))
return NULL;
}
else if (Tokenizer_emit_text(self, this))
return NULL;
}
else if ((this == *"\n" || this == *":") && this_context & LC_DLTERM) {
if (Tokenizer_handle_dl_term(self))
return NULL;
}
else if (Tokenizer_emit_text(self, this))
return NULL;
self->head++;


+ 53
- 43
mwparserfromhell/parser/tokenizer.h View File

@@ -41,20 +41,21 @@ SOFTWARE.
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

static const char* MARKERS[] = {
"{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-",
"\n", ""};
"{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", ":", "/",
"-", "\n", ""};

#define NUM_MARKERS 17
#define NUM_MARKERS 18
#define TEXTBUFFER_BLOCKSIZE 1024
#define MAX_DEPTH 40
#define MAX_CYCLES 100000
#define MAX_BRACES 255
#define MAX_ENTITY_SIZE 8

static int route_state = 0;
#define BAD_ROUTE (route_state)
#define FAIL_ROUTE() (route_state = 1)
#define RESET_ROUTE() (route_state = 0)
/*
    Route-failure bookkeeping.

    route_state is set to 1 by FAIL_ROUTE and back to 0 by RESET_ROUTE;
    route_context records the context value passed to the most recent
    FAIL_ROUTE call.

    Every macro expands to one parenthesized expression. In particular
    FAIL_ROUTE uses the comma operator, so that `if (cond) FAIL_ROUTE(ctx);`
    guards both assignments. A two-statement expansion would leave the
    route_context assignment outside the `if`.
*/
static int route_state = 0, route_context = 0;
#define BAD_ROUTE (route_state)
#define BAD_ROUTE_CONTEXT (route_context)
#define FAIL_ROUTE(context) (route_state = 1, route_context = (context))
#define RESET_ROUTE() (route_state = 0)

static char** entitydefs;

@@ -102,42 +103,50 @@ static PyObject* TagCloseClose;

/* Local contexts: */

#define LC_TEMPLATE 0x000007
#define LC_TEMPLATE_NAME 0x000001
#define LC_TEMPLATE_PARAM_KEY 0x000002
#define LC_TEMPLATE_PARAM_VALUE 0x000004

#define LC_ARGUMENT 0x000018
#define LC_ARGUMENT_NAME 0x000008
#define LC_ARGUMENT_DEFAULT 0x000010

#define LC_WIKILINK 0x000060
#define LC_WIKILINK_TITLE 0x000020
#define LC_WIKILINK_TEXT 0x000040

#define LC_HEADING 0x001F80
#define LC_HEADING_LEVEL_1 0x000080
#define LC_HEADING_LEVEL_2 0x000100
#define LC_HEADING_LEVEL_3 0x000200
#define LC_HEADING_LEVEL_4 0x000400
#define LC_HEADING_LEVEL_5 0x000800
#define LC_HEADING_LEVEL_6 0x001000

#define LC_COMMENT 0x002000

#define LC_TAG 0x03C000
#define LC_TAG_OPEN 0x004000
#define LC_TAG_ATTR 0x008000
#define LC_TAG_BODY 0x010000
#define LC_TAG_CLOSE 0x020000

#define LC_SAFETY_CHECK 0xFC0000
#define LC_HAS_TEXT 0x040000
#define LC_FAIL_ON_TEXT 0x080000
#define LC_FAIL_NEXT 0x100000
#define LC_FAIL_ON_LBRACE 0x200000
#define LC_FAIL_ON_RBRACE 0x400000
#define LC_FAIL_ON_EQUALS 0x800000
/* Each group below starts with an aggregate mask equal to the bitwise OR of
   the single-bit flags that follow it. LC_COMMENT and LC_DLTERM are
   single-bit flags with no sub-flags. */

/* Template flags (bits 0-2). */
#define LC_TEMPLATE 0x00000007
#define LC_TEMPLATE_NAME 0x00000001
#define LC_TEMPLATE_PARAM_KEY 0x00000002
#define LC_TEMPLATE_PARAM_VALUE 0x00000004

/* Argument flags (bits 3-4). */
#define LC_ARGUMENT 0x00000018
#define LC_ARGUMENT_NAME 0x00000008
#define LC_ARGUMENT_DEFAULT 0x00000010

/* Wikilink flags (bits 5-6). */
#define LC_WIKILINK 0x00000060
#define LC_WIKILINK_TITLE 0x00000020
#define LC_WIKILINK_TEXT 0x00000040

/* Heading flags, one per level (bits 7-12). */
#define LC_HEADING 0x00001F80
#define LC_HEADING_LEVEL_1 0x00000080
#define LC_HEADING_LEVEL_2 0x00000100
#define LC_HEADING_LEVEL_3 0x00000200
#define LC_HEADING_LEVEL_4 0x00000400
#define LC_HEADING_LEVEL_5 0x00000800
#define LC_HEADING_LEVEL_6 0x00001000

/* Comment flag (bit 13). */
#define LC_COMMENT 0x00002000

/* Tag flags (bits 14-17). */
#define LC_TAG 0x0003C000
#define LC_TAG_OPEN 0x00004000
#define LC_TAG_ATTR 0x00008000
#define LC_TAG_BODY 0x00010000
#define LC_TAG_CLOSE 0x00020000

/* Style flags (bits 18-21). */
#define LC_STYLE 0x003C0000
#define LC_STYLE_ITALICS 0x00040000
#define LC_STYLE_BOLD 0x00080000
#define LC_STYLE_PASS_AGAIN 0x00100000
#define LC_STYLE_SECOND_PASS 0x00200000

/* Description-list term flag (bit 22). */
#define LC_DLTERM 0x00400000

/* Safety-check flags (bits 23-28). */
#define LC_SAFETY_CHECK 0x1F800000
#define LC_HAS_TEXT 0x00800000
#define LC_FAIL_ON_TEXT 0x01000000
#define LC_FAIL_NEXT 0x02000000
#define LC_FAIL_ON_LBRACE 0x04000000
#define LC_FAIL_ON_RBRACE 0x08000000
#define LC_FAIL_ON_EQUALS 0x10000000

/* Global contexts: */

@@ -211,6 +220,7 @@ typedef struct {

/* Macros for accessing HTML tag definitions.
   Each one forwards its argument to call_tag_def_func together with the name
   of the definition function to invoke. The tokenizer tests the result for
   truth, e.g. `if (!IS_SINGLE_ONLY(name))`.
   NOTE(review): the exact return type of call_tag_def_func is not visible
   here -- confirm before using a result as anything other than a truth
   value. */

#define GET_HTML_TAG(markup) (call_tag_def_func("get_html_tag", markup))
#define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag))
#define IS_SINGLE(tag) (call_tag_def_func("is_single", tag))
#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag))


Loading…
Cancel
Save