瀏覽代碼

Add an emit_string() as a shortcut; a bunch of minor cleanup.

tags/v0.3
Ben Kurtovic 11 年之前
父節點
當前提交
c1379d5f21
共有 1 個檔案被更改,包括 60 行新增83 行删除
  1. +60
    -83
      mwparserfromhell/parser/tokenizer.c

+ 60
- 83
mwparserfromhell/parser/tokenizer.c 查看文件

@@ -29,6 +29,7 @@ SOFTWARE.
static int heading_level_from_context(int n) static int heading_level_from_context(int n)
{ {
int level; int level;

n /= LC_HEADING_LEVEL_1; n /= LC_HEADING_LEVEL_1;
for (level = 1; n > 1; n >>= 1) for (level = 1; n > 1; n >>= 1)
level++; level++;
@@ -72,6 +73,7 @@ static PyObject* strip_tag_name(PyObject* token)
static Textbuffer* Textbuffer_new(void) static Textbuffer* Textbuffer_new(void)
{ {
Textbuffer* buffer = malloc(sizeof(Textbuffer)); Textbuffer* buffer = malloc(sizeof(Textbuffer));

if (!buffer) { if (!buffer) {
PyErr_NoMemory(); PyErr_NoMemory();
return NULL; return NULL;
@@ -90,6 +92,7 @@ static Textbuffer* Textbuffer_new(void)
static void Textbuffer_dealloc(Textbuffer* self) static void Textbuffer_dealloc(Textbuffer* self)
{ {
Textbuffer* next; Textbuffer* next;

while (self) { while (self) {
free(self->data); free(self->data);
next = self->next; next = self->next;
@@ -104,6 +107,7 @@ static void Textbuffer_dealloc(Textbuffer* self)
static int Textbuffer_write(Textbuffer** this, Py_UNICODE text) static int Textbuffer_write(Textbuffer** this, Py_UNICODE text)
{ {
Textbuffer* self = *this; Textbuffer* self = *this;

if (self->size == TEXTBUFFER_BLOCKSIZE) { if (self->size == TEXTBUFFER_BLOCKSIZE) {
Textbuffer* new = Textbuffer_new(); Textbuffer* new = Textbuffer_new();
if (!new) if (!new)
@@ -123,6 +127,7 @@ static PyObject* Textbuffer_render(Textbuffer* self)
{ {
PyObject *result = PyUnicode_FromUnicode(self->data, self->size); PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
PyObject *left, *concat; PyObject *left, *concat;

while (self->next) { while (self->next) {
self = self->next; self = self->next;
left = PyUnicode_FromUnicode(self->data, self->size); left = PyUnicode_FromUnicode(self->data, self->size);
@@ -208,6 +213,7 @@ static void Tokenizer_dealloc(Tokenizer* self)
static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
{ {
static char* kwlist[] = {NULL}; static char* kwlist[] = {NULL};

if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist)) if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
return -1; return -1;
self->text = Py_None; self->text = Py_None;
@@ -223,6 +229,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
static int Tokenizer_push(Tokenizer* self, int context) static int Tokenizer_push(Tokenizer* self, int context)
{ {
Stack* top = malloc(sizeof(Stack)); Stack* top = malloc(sizeof(Stack));

if (!top) { if (!top) {
PyErr_NoMemory(); PyErr_NoMemory();
return -1; return -1;
@@ -246,6 +253,7 @@ static int Tokenizer_push_textbuffer(Tokenizer* self)
{ {
PyObject *text, *kwargs, *token; PyObject *text, *kwargs, *token;
Textbuffer* buffer = self->topstack->textbuffer; Textbuffer* buffer = self->topstack->textbuffer;

if (buffer->size == 0 && !buffer->next) if (buffer->size == 0 && !buffer->next)
return 0; return 0;
text = Textbuffer_render(buffer); text = Textbuffer_render(buffer);
@@ -280,6 +288,7 @@ static int Tokenizer_push_textbuffer(Tokenizer* self)
static void Tokenizer_delete_top_of_stack(Tokenizer* self) static void Tokenizer_delete_top_of_stack(Tokenizer* self)
{ {
Stack* top = self->topstack; Stack* top = self->topstack;

Py_DECREF(top->stack); Py_DECREF(top->stack);
Textbuffer_dealloc(top->textbuffer); Textbuffer_dealloc(top->textbuffer);
self->topstack = top->next; self->topstack = top->next;
@@ -293,6 +302,7 @@ static void Tokenizer_delete_top_of_stack(Tokenizer* self)
static PyObject* Tokenizer_pop(Tokenizer* self) static PyObject* Tokenizer_pop(Tokenizer* self)
{ {
PyObject* stack; PyObject* stack;

if (Tokenizer_push_textbuffer(self)) if (Tokenizer_push_textbuffer(self))
return NULL; return NULL;
stack = self->topstack->stack; stack = self->topstack->stack;
@@ -309,6 +319,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
{ {
PyObject* stack; PyObject* stack;
int context; int context;

if (Tokenizer_push_textbuffer(self)) if (Tokenizer_push_textbuffer(self))
return NULL; return NULL;
stack = self->topstack->stack; stack = self->topstack->stack;
@@ -327,6 +338,7 @@ static void* Tokenizer_fail_route(Tokenizer* self)
{ {
int context = self->topstack->context; int context = self->topstack->context;
PyObject* stack = Tokenizer_pop(self); PyObject* stack = Tokenizer_pop(self);

Py_XDECREF(stack); Py_XDECREF(stack);
FAIL_ROUTE(context); FAIL_ROUTE(context);
return NULL; return NULL;
@@ -365,6 +377,21 @@ static int Tokenizer_emit_text(Tokenizer* self, Py_UNICODE text)
} }


/*
    Write a string of text to the current textbuffer, one character at a time.

    Returns 0 on success, -1 on failure (error state already set by
    Tokenizer_emit_text). `text` must be a NUL-terminated C string.
*/
static int Tokenizer_emit_string(Tokenizer* self, const char* text)
{
    int i = 0;

    while (text[i]) {
        /* Convert through unsigned char first: plain char may be signed,
           and bytes >= 0x80 would otherwise sign-extend into bogus
           (negative) code points when widened to Py_UNICODE. */
        if (Tokenizer_emit_text(self, (unsigned char) text[i]))
            return -1;
        i++;
    }
    return 0;
}

/*
Write a series of tokens to the current stack at once. Write a series of tokens to the current stack at once.
*/ */
static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist) static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
@@ -428,15 +455,10 @@ static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text) static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
{ {
PyObject* stack = Tokenizer_pop(self); PyObject* stack = Tokenizer_pop(self);
int i = 0;
while (1) {
if (!text[i])
break;
if (Tokenizer_emit_text(self, (Py_UNICODE) text[i])) {
Py_XDECREF(stack);
return -1;
}
i++;

if (Tokenizer_emit_string(self, text)) {
Py_DECREF(stack);
return -1;
} }
if (stack) { if (stack) {
if (PyList_GET_SIZE(stack) > 0) { if (PyList_GET_SIZE(stack) > 0) {
@@ -457,6 +479,7 @@ static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta) static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
{ {
Py_ssize_t index = self->head + delta; Py_ssize_t index = self->head + delta;

if (index >= self->length) if (index >= self->length)
return EMPTY; return EMPTY;
return PyList_GET_ITEM(self->text, index); return PyList_GET_ITEM(self->text, index);
@@ -468,6 +491,7 @@ static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
{ {
Py_ssize_t index; Py_ssize_t index;

if (delta > self->head) if (delta > self->head)
return EMPTY; return EMPTY;
index = self->head - delta; index = self->head - delta;
@@ -752,7 +776,6 @@ static int Tokenizer_parse_wikilink(Tokenizer* self)
{ {
Py_ssize_t reset; Py_ssize_t reset;
PyObject *wikilink, *token; PyObject *wikilink, *token;
int i;


self->head += 2; self->head += 2;
reset = self->head - 1; reset = self->head - 1;
@@ -760,10 +783,8 @@ static int Tokenizer_parse_wikilink(Tokenizer* self)
if (BAD_ROUTE) { if (BAD_ROUTE) {
RESET_ROUTE(); RESET_ROUTE();
self->head = reset; self->head = reset;
for (i = 0; i < 2; i++) {
if (Tokenizer_emit_text(self, *"["))
return -1;
}
if (Tokenizer_emit_string(self, "[["))
return -1;
return 0; return 0;
} }
if (!wikilink) if (!wikilink)
@@ -1183,24 +1204,14 @@ static int Tokenizer_parse_comment(Tokenizer* self)
{ {
Py_ssize_t reset = self->head + 3; Py_ssize_t reset = self->head + 3;
PyObject *token, *comment; PyObject *token, *comment;
int i;


self->head += 4; self->head += 4;
comment = Tokenizer_parse(self, LC_COMMENT, 1); comment = Tokenizer_parse(self, LC_COMMENT, 1);
if (BAD_ROUTE) { if (BAD_ROUTE) {
const char* text = "<!--";
RESET_ROUTE(); RESET_ROUTE();
self->head = reset; self->head = reset;
i = 0;
while (1) {
if (!text[i])
return 0;
if (Tokenizer_emit_text(self, (Py_UNICODE) text[i])) {
Py_XDECREF(text);
return -1;
}
i++;
}
if (Tokenizer_emit_string(self, "<!--"))
return -1;
return 0; return 0;
} }
if (!comment) if (!comment)
@@ -1791,8 +1802,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
if (BAD_ROUTE) { if (BAD_ROUTE) {
RESET_ROUTE(); RESET_ROUTE();
self->head = reset; self->head = reset;
return (Tokenizer_emit_text(self, *"<") ||
Tokenizer_emit_text(self, *"/"));
return Tokenizer_emit_string(self, "</");
} }
// Set invalid=True flag of TagOpenOpen // Set invalid=True flag of TagOpenOpen
if (PyObject_SetAttrString(PyList_GET_ITEM(tag, 0), "invalid", Py_True)) if (PyObject_SetAttrString(PyList_GET_ITEM(tag, 0), "invalid", Py_True))
@@ -1826,16 +1836,12 @@ static int Tokenizer_parse_tag(Tokenizer* self)
/* /*
Write the body of a tag and the tokens that should surround it. Write the body of a tag and the tokens that should surround it.
*/ */
static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks,
PyObject* body)
static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag,
const char* ticks, PyObject* body)
{ {
PyObject *markup, *kwargs, *token; PyObject *markup, *kwargs, *token;
char chr_markup[4];
int i;


for (i = 0; i < ticks; i++) chr_markup[i] = *"'";
chr_markup[ticks] = *"";
markup = PyBytes_FromString(chr_markup);
markup = PyBytes_FromString(ticks);
if (!markup) if (!markup)
return -1; return -1;
kwargs = PyDict_New(); kwargs = PyDict_New();
@@ -1856,7 +1862,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks,
return -1; return -1;
} }
Py_DECREF(token); Py_DECREF(token);
if (Tokenizer_emit_text(self, tag))
if (Tokenizer_emit_string(self, tag))
return -1; return -1;
token = PyObject_CallObject(TagCloseOpen, NULL); token = PyObject_CallObject(TagCloseOpen, NULL);
if (!token) if (!token)
@@ -1876,7 +1882,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks,
return -1; return -1;
} }
Py_DECREF(token); Py_DECREF(token);
if (Tokenizer_emit_text(self, tag))
if (Tokenizer_emit_string(self, tag))
return -1; return -1;
token = PyObject_CallObject(TagCloseClose, NULL); token = PyObject_CallObject(TagCloseClose, NULL);
if (!token) if (!token)
@@ -1907,15 +1913,12 @@ static int Tokenizer_parse_italics(Tokenizer* self)
context = LC_STYLE_ITALICS | LC_STYLE_SECOND_PASS; context = LC_STYLE_ITALICS | LC_STYLE_SECOND_PASS;
stack = Tokenizer_parse(self, context, 1); stack = Tokenizer_parse(self, context, 1);
} }
else {
if (Tokenizer_emit_text(self, *"'"))
return -1;
return Tokenizer_emit_text(self, *"'");
}
else
return Tokenizer_emit_string(self, "''");
} }
if (!stack) if (!stack)
return -1; return -1;
return Tokenizer_emit_style_tag(self, *"i", 2, stack);
return Tokenizer_emit_style_tag(self, "i", "''", stack);
} }


/* /*
@@ -1934,11 +1937,7 @@ static int Tokenizer_parse_bold(Tokenizer* self)
return Tokenizer_emit_text(self, *"'") ? -1 : 1; return Tokenizer_emit_text(self, *"'") ? -1 : 1;
if (self->topstack->context & LC_STYLE_ITALICS) { if (self->topstack->context & LC_STYLE_ITALICS) {
self->topstack->context |= LC_STYLE_PASS_AGAIN; self->topstack->context |= LC_STYLE_PASS_AGAIN;
if (Tokenizer_emit_text(self, *"'"))
return -1;
if (Tokenizer_emit_text(self, *"'"))
return -1;
return Tokenizer_emit_text(self, *"'");
return Tokenizer_emit_string(self, "'''");
} }
if (Tokenizer_emit_text(self, *"'")) if (Tokenizer_emit_text(self, *"'"))
return -1; return -1;
@@ -1946,7 +1945,7 @@ static int Tokenizer_parse_bold(Tokenizer* self)
} }
if (!stack) if (!stack)
return -1; return -1;
return Tokenizer_emit_style_tag(self, *"b", 3, stack);
return Tokenizer_emit_style_tag(self, "b", "'''", stack);
} }


/* /*
@@ -1965,15 +1964,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
if (BAD_ROUTE) { if (BAD_ROUTE) {
RESET_ROUTE(); RESET_ROUTE();
self->head = reset; self->head = reset;
if (Tokenizer_emit_text(self, *"'"))
return -1;
if (Tokenizer_emit_text(self, *"'"))
return -1;
if (Tokenizer_emit_text(self, *"'"))
return -1;
if (Tokenizer_emit_text(self, *"'"))
return -1;
return Tokenizer_emit_text(self, *"'");
return Tokenizer_emit_string(self, "'''''");
} }
if (!stack) if (!stack)
return -1; return -1;
@@ -1982,19 +1973,15 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
if (BAD_ROUTE) { if (BAD_ROUTE) {
RESET_ROUTE(); RESET_ROUTE();
self->head = reset; self->head = reset;
if (Tokenizer_emit_text(self, *"'"))
return -1;
if (Tokenizer_emit_text(self, *"'"))
return -1;
if (Tokenizer_emit_text(self, *"'"))
if (Tokenizer_emit_string(self, "'''"))
return -1; return -1;
return Tokenizer_emit_style_tag(self, *"i", 2, stack);
return Tokenizer_emit_style_tag(self, "i", "''", stack);
} }
if (!stack2) if (!stack2)
return -1; return -1;
if (Tokenizer_push(self, 0)) if (Tokenizer_push(self, 0))
return -1; return -1;
if (Tokenizer_emit_style_tag(self, *"i", 2, stack))
if (Tokenizer_emit_style_tag(self, "i", "''", stack))
return -1; return -1;
if (Tokenizer_emit_all(self, stack2)) if (Tokenizer_emit_all(self, stack2))
return -1; return -1;
@@ -2002,7 +1989,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
stack2 = Tokenizer_pop(self); stack2 = Tokenizer_pop(self);
if (!stack2) if (!stack2)
return -1; return -1;
return Tokenizer_emit_style_tag(self, *"b", 3, stack2);
return Tokenizer_emit_style_tag(self, "b", "'''", stack2);
} }
if (!stack) if (!stack)
return -1; return -1;
@@ -2011,17 +1998,15 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
if (BAD_ROUTE) { if (BAD_ROUTE) {
RESET_ROUTE(); RESET_ROUTE();
self->head = reset; self->head = reset;
if (Tokenizer_emit_text(self, *"'"))
if (Tokenizer_emit_string(self, "''"))
return -1; return -1;
if (Tokenizer_emit_text(self, *"'"))
return -1;
return Tokenizer_emit_style_tag(self, *"b", 3, stack);
return Tokenizer_emit_style_tag(self, "b", "'''", stack);
} }
if (!stack2) if (!stack2)
return -1; return -1;
if (Tokenizer_push(self, 0)) if (Tokenizer_push(self, 0))
return -1; return -1;
if (Tokenizer_emit_style_tag(self, *"b", 3, stack))
if (Tokenizer_emit_style_tag(self, "b", "'''", stack))
return -1; return -1;
if (Tokenizer_emit_all(self, stack2)) if (Tokenizer_emit_all(self, stack2))
return -1; return -1;
@@ -2029,7 +2014,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
stack2 = Tokenizer_pop(self); stack2 = Tokenizer_pop(self);
if (!stack2) if (!stack2)
return -1; return -1;
return Tokenizer_emit_style_tag(self, *"i", 2, stack2);
return Tokenizer_emit_style_tag(self, "i", "''", stack2);
} }


/* /*
@@ -2103,8 +2088,6 @@ static int Tokenizer_handle_list_marker(Tokenizer* self)
{ {
PyObject *markup = Tokenizer_read(self, 0), *kwargs, *token; PyObject *markup = Tokenizer_read(self, 0), *kwargs, *token;
Py_UNICODE code = *PyUnicode_AS_UNICODE(markup); Py_UNICODE code = *PyUnicode_AS_UNICODE(markup);
char *html;
int i = 0;


if (code == *";") if (code == *";")
self->topstack->context |= LC_DLTERM; self->topstack->context |= LC_DLTERM;
@@ -2123,12 +2106,8 @@ static int Tokenizer_handle_list_marker(Tokenizer* self)
return -1; return -1;
} }
Py_DECREF(token); Py_DECREF(token);
html = GET_HTML_TAG(code);
while (html[i]) {
if (Tokenizer_emit_text(self, html[i]))
return -1;
i++;
}
if (Tokenizer_emit_string(self, GET_HTML_TAG(code)))
return -1;
token = PyObject_CallObject(TagCloseSelfclose, NULL); token = PyObject_CallObject(TagCloseSelfclose, NULL);
if (!token) if (!token)
return -1; return -1;
@@ -2200,9 +2179,7 @@ static int Tokenizer_handle_hr(Tokenizer* self)
return -1; return -1;
} }
Py_DECREF(token); Py_DECREF(token);
if (Tokenizer_emit_text(self, *"h"))
return -1;
if (Tokenizer_emit_text(self, *"r"))
if (Tokenizer_emit_string(self, "hr"))
return -1; return -1;
token = PyObject_CallObject(TagCloseSelfclose, NULL); token = PyObject_CallObject(TagCloseSelfclose, NULL);
if (!token) if (!token)


Loading…
取消
儲存