
Add an emit_string() as a shortcut; a bunch of minor cleanup.
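
For context: the new Tokenizer_emit_string() simply wraps the existing per-character Tokenizer_emit_text() in a loop, so call sites can emit a whole NUL-terminated string in one call. A minimal standalone sketch of the pattern follows (the putchar()-based emitter and main() are illustrative stand-ins, not the tokenizer's real machinery):

#include <stdio.h>

/* Stand-in for Tokenizer_emit_text(): emit one character, returning
   nonzero on failure, matching the error convention in the diff below. */
static int emit_text(char ch)
{
    return putchar(ch) == EOF ? -1 : 0;
}

/* The shortcut this commit adds: walk the string and forward each
   character to the per-character emitter, propagating failures. */
static int emit_string(const char* text)
{
    int i = 0;

    while (text[i]) {
        if (emit_text(text[i]))
            return -1;
        i++;
    }
    return 0;
}

int main(void)
{
    /* Call sites that previously looped over "[[", "<!--", "'''", etc.
       now collapse to a single call each. */
    return emit_string("<!--\n") ? 1 : 0;
}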

tags/v0.3
Ben Kurtovic 10 years ago
commit c1379d5f21
1 changed files with 60 additions and 83 deletions
  1. +60 -83 mwparserfromhell/parser/tokenizer.c

mwparserfromhell/parser/tokenizer.c

@@ -29,6 +29,7 @@ SOFTWARE.
 static int heading_level_from_context(int n)
 {
     int level;
+
     n /= LC_HEADING_LEVEL_1;
     for (level = 1; n > 1; n >>= 1)
         level++;
@@ -72,6 +73,7 @@ static PyObject* strip_tag_name(PyObject* token)
 static Textbuffer* Textbuffer_new(void)
 {
     Textbuffer* buffer = malloc(sizeof(Textbuffer));
+
     if (!buffer) {
         PyErr_NoMemory();
         return NULL;
@@ -90,6 +92,7 @@ static Textbuffer* Textbuffer_new(void)
 static void Textbuffer_dealloc(Textbuffer* self)
 {
     Textbuffer* next;
+
     while (self) {
         free(self->data);
         next = self->next;
@@ -104,6 +107,7 @@ static void Textbuffer_dealloc(Textbuffer* self)
 static int Textbuffer_write(Textbuffer** this, Py_UNICODE text)
 {
     Textbuffer* self = *this;
+
     if (self->size == TEXTBUFFER_BLOCKSIZE) {
         Textbuffer* new = Textbuffer_new();
         if (!new)
@@ -123,6 +127,7 @@ static PyObject* Textbuffer_render(Textbuffer* self)
 {
     PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
     PyObject *left, *concat;
+
     while (self->next) {
         self = self->next;
         left = PyUnicode_FromUnicode(self->data, self->size);
@@ -208,6 +213,7 @@ static void Tokenizer_dealloc(Tokenizer* self)
 static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
 {
     static char* kwlist[] = {NULL};
+
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
         return -1;
     self->text = Py_None;
@@ -223,6 +229,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
 static int Tokenizer_push(Tokenizer* self, int context)
 {
     Stack* top = malloc(sizeof(Stack));
+
     if (!top) {
         PyErr_NoMemory();
         return -1;
@@ -246,6 +253,7 @@ static int Tokenizer_push_textbuffer(Tokenizer* self)
 {
     PyObject *text, *kwargs, *token;
     Textbuffer* buffer = self->topstack->textbuffer;
+
     if (buffer->size == 0 && !buffer->next)
         return 0;
     text = Textbuffer_render(buffer);
@@ -280,6 +288,7 @@ static int Tokenizer_push_textbuffer(Tokenizer* self)
 static void Tokenizer_delete_top_of_stack(Tokenizer* self)
 {
     Stack* top = self->topstack;
+
     Py_DECREF(top->stack);
     Textbuffer_dealloc(top->textbuffer);
     self->topstack = top->next;
@@ -293,6 +302,7 @@ static void Tokenizer_delete_top_of_stack(Tokenizer* self)
 static PyObject* Tokenizer_pop(Tokenizer* self)
 {
     PyObject* stack;
+
     if (Tokenizer_push_textbuffer(self))
         return NULL;
     stack = self->topstack->stack;
@@ -309,6 +319,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
 {
     PyObject* stack;
     int context;
+
     if (Tokenizer_push_textbuffer(self))
         return NULL;
     stack = self->topstack->stack;
@@ -327,6 +338,7 @@ static void* Tokenizer_fail_route(Tokenizer* self)
 {
     int context = self->topstack->context;
     PyObject* stack = Tokenizer_pop(self);
+
     Py_XDECREF(stack);
     FAIL_ROUTE(context);
     return NULL;
@@ -365,6 +377,21 @@ static int Tokenizer_emit_text(Tokenizer* self, Py_UNICODE text)
 }
 
+/*
+    Write a string of text to the current textbuffer.
+*/
+static int Tokenizer_emit_string(Tokenizer* self, const char* text)
+{
+    int i = 0;
+
+    while (text[i]) {
+        if (Tokenizer_emit_text(self, text[i]))
+            return -1;
+        i++;
+    }
+    return 0;
+}
+
 /*
     Write a series of tokens to the current stack at once.
 */
 static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
@@ -428,15 +455,10 @@ static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
 static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
 {
     PyObject* stack = Tokenizer_pop(self);
-    int i = 0;
-    while (1) {
-        if (!text[i])
-            break;
-        if (Tokenizer_emit_text(self, (Py_UNICODE) text[i])) {
-            Py_XDECREF(stack);
-            return -1;
-        }
-        i++;
-    }
+
+    if (Tokenizer_emit_string(self, text)) {
+        Py_DECREF(stack);
+        return -1;
+    }
     if (stack) {
         if (PyList_GET_SIZE(stack) > 0) {
@@ -457,6 +479,7 @@ static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
 static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
 {
     Py_ssize_t index = self->head + delta;
+
     if (index >= self->length)
         return EMPTY;
     return PyList_GET_ITEM(self->text, index);
@@ -468,6 +491,7 @@ static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
 static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
 {
     Py_ssize_t index;
+
     if (delta > self->head)
         return EMPTY;
     index = self->head - delta;
@@ -752,7 +776,6 @@ static int Tokenizer_parse_wikilink(Tokenizer* self)
 {
     Py_ssize_t reset;
     PyObject *wikilink, *token;
-    int i;
 
     self->head += 2;
     reset = self->head - 1;
@@ -760,10 +783,8 @@ static int Tokenizer_parse_wikilink(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        for (i = 0; i < 2; i++) {
-            if (Tokenizer_emit_text(self, *"["))
-                return -1;
-        }
+        if (Tokenizer_emit_string(self, "[["))
+            return -1;
         return 0;
     }
     if (!wikilink)
@@ -1183,24 +1204,14 @@ static int Tokenizer_parse_comment(Tokenizer* self)
 {
     Py_ssize_t reset = self->head + 3;
     PyObject *token, *comment;
-    int i;
 
     self->head += 4;
     comment = Tokenizer_parse(self, LC_COMMENT, 1);
     if (BAD_ROUTE) {
-        const char* text = "<!--";
         RESET_ROUTE();
         self->head = reset;
-        i = 0;
-        while (1) {
-            if (!text[i])
-                return 0;
-            if (Tokenizer_emit_text(self, (Py_UNICODE) text[i])) {
-                Py_XDECREF(text);
-                return -1;
-            }
-            i++;
-        }
+        if (Tokenizer_emit_string(self, "<!--"))
+            return -1;
+        return 0;
     }
     if (!comment)
@@ -1791,8 +1802,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        return (Tokenizer_emit_text(self, *"<") ||
-                Tokenizer_emit_text(self, *"/"));
+        return Tokenizer_emit_string(self, "</");
     }
     // Set invalid=True flag of TagOpenOpen
     if (PyObject_SetAttrString(PyList_GET_ITEM(tag, 0), "invalid", Py_True))
@@ -1826,16 +1836,12 @@ static int Tokenizer_parse_tag(Tokenizer* self)
 /*
     Write the body of a tag and the tokens that should surround it.
 */
-static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks,
-                                    PyObject* body)
+static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag,
+                                    const char* ticks, PyObject* body)
 {
     PyObject *markup, *kwargs, *token;
-    char chr_markup[4];
-    int i;
 
-    for (i = 0; i < ticks; i++) chr_markup[i] = *"'";
-    chr_markup[ticks] = *"";
-    markup = PyBytes_FromString(chr_markup);
+    markup = PyBytes_FromString(ticks);
     if (!markup)
         return -1;
     kwargs = PyDict_New();
@@ -1856,7 +1862,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks,
         return -1;
     }
     Py_DECREF(token);
-    if (Tokenizer_emit_text(self, tag))
+    if (Tokenizer_emit_string(self, tag))
         return -1;
     token = PyObject_CallObject(TagCloseOpen, NULL);
     if (!token)
@@ -1876,7 +1882,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks,
         return -1;
     }
     Py_DECREF(token);
-    if (Tokenizer_emit_text(self, tag))
+    if (Tokenizer_emit_string(self, tag))
         return -1;
     token = PyObject_CallObject(TagCloseClose, NULL);
     if (!token)
@@ -1907,15 +1913,12 @@ static int Tokenizer_parse_italics(Tokenizer* self)
             context = LC_STYLE_ITALICS | LC_STYLE_SECOND_PASS;
             stack = Tokenizer_parse(self, context, 1);
         }
-        else {
-            if (Tokenizer_emit_text(self, *"'"))
-                return -1;
-            return Tokenizer_emit_text(self, *"'");
-        }
+        else
+            return Tokenizer_emit_string(self, "''");
     }
     if (!stack)
         return -1;
-    return Tokenizer_emit_style_tag(self, *"i", 2, stack);
+    return Tokenizer_emit_style_tag(self, "i", "''", stack);
 }
 
 /*
@@ -1934,11 +1937,7 @@ static int Tokenizer_parse_bold(Tokenizer* self)
             return Tokenizer_emit_text(self, *"'") ? -1 : 1;
         if (self->topstack->context & LC_STYLE_ITALICS) {
             self->topstack->context |= LC_STYLE_PASS_AGAIN;
-            if (Tokenizer_emit_text(self, *"'"))
-                return -1;
-            if (Tokenizer_emit_text(self, *"'"))
-                return -1;
-            return Tokenizer_emit_text(self, *"'");
+            return Tokenizer_emit_string(self, "'''");
         }
         if (Tokenizer_emit_text(self, *"'"))
             return -1;
@@ -1946,7 +1945,7 @@ static int Tokenizer_parse_bold(Tokenizer* self)
     }
     if (!stack)
         return -1;
-    return Tokenizer_emit_style_tag(self, *"b", 3, stack);
+    return Tokenizer_emit_style_tag(self, "b", "'''", stack);
 }
 
 /*
@@ -1965,15 +1964,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
         if (BAD_ROUTE) {
             RESET_ROUTE();
             self->head = reset;
-            if (Tokenizer_emit_text(self, *"'"))
-                return -1;
-            if (Tokenizer_emit_text(self, *"'"))
-                return -1;
-            if (Tokenizer_emit_text(self, *"'"))
-                return -1;
-            if (Tokenizer_emit_text(self, *"'"))
-                return -1;
-            return Tokenizer_emit_text(self, *"'");
+            return Tokenizer_emit_string(self, "'''''");
         }
         if (!stack)
             return -1;
@@ -1982,19 +1973,15 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
         if (BAD_ROUTE) {
             RESET_ROUTE();
             self->head = reset;
-            if (Tokenizer_emit_text(self, *"'"))
-                return -1;
-            if (Tokenizer_emit_text(self, *"'"))
-                return -1;
-            if (Tokenizer_emit_text(self, *"'"))
+            if (Tokenizer_emit_string(self, "'''"))
                 return -1;
-            return Tokenizer_emit_style_tag(self, *"i", 2, stack);
+            return Tokenizer_emit_style_tag(self, "i", "''", stack);
         }
         if (!stack2)
             return -1;
         if (Tokenizer_push(self, 0))
             return -1;
-        if (Tokenizer_emit_style_tag(self, *"i", 2, stack))
+        if (Tokenizer_emit_style_tag(self, "i", "''", stack))
             return -1;
         if (Tokenizer_emit_all(self, stack2))
             return -1;
@@ -2002,7 +1989,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
         stack2 = Tokenizer_pop(self);
         if (!stack2)
             return -1;
-        return Tokenizer_emit_style_tag(self, *"b", 3, stack2);
+        return Tokenizer_emit_style_tag(self, "b", "'''", stack2);
     }
     if (!stack)
         return -1;
@@ -2011,17 +1998,15 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        if (Tokenizer_emit_text(self, *"'"))
+        if (Tokenizer_emit_string(self, "''"))
             return -1;
-        if (Tokenizer_emit_text(self, *"'"))
-            return -1;
-        return Tokenizer_emit_style_tag(self, *"b", 3, stack);
+        return Tokenizer_emit_style_tag(self, "b", "'''", stack);
     }
     if (!stack2)
         return -1;
     if (Tokenizer_push(self, 0))
         return -1;
-    if (Tokenizer_emit_style_tag(self, *"b", 3, stack))
+    if (Tokenizer_emit_style_tag(self, "b", "'''", stack))
         return -1;
     if (Tokenizer_emit_all(self, stack2))
         return -1;
@@ -2029,7 +2014,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
     stack2 = Tokenizer_pop(self);
     if (!stack2)
         return -1;
-    return Tokenizer_emit_style_tag(self, *"i", 2, stack2);
+    return Tokenizer_emit_style_tag(self, "i", "''", stack2);
 }
 
 /*
@@ -2103,8 +2088,6 @@ static int Tokenizer_handle_list_marker(Tokenizer* self)
 {
     PyObject *markup = Tokenizer_read(self, 0), *kwargs, *token;
     Py_UNICODE code = *PyUnicode_AS_UNICODE(markup);
-    char *html;
-    int i = 0;
 
     if (code == *";")
         self->topstack->context |= LC_DLTERM;
@@ -2123,12 +2106,8 @@ static int Tokenizer_handle_list_marker(Tokenizer* self)
         return -1;
     }
     Py_DECREF(token);
-    html = GET_HTML_TAG(code);
-    while (html[i]) {
-        if (Tokenizer_emit_text(self, html[i]))
-            return -1;
-        i++;
-    }
+    if (Tokenizer_emit_string(self, GET_HTML_TAG(code)))
+        return -1;
     token = PyObject_CallObject(TagCloseSelfclose, NULL);
     if (!token)
         return -1;
@@ -2200,9 +2179,7 @@ static int Tokenizer_handle_hr(Tokenizer* self)
         return -1;
     }
     Py_DECREF(token);
-    if (Tokenizer_emit_text(self, *"h"))
-        return -1;
-    if (Tokenizer_emit_text(self, *"r"))
+    if (Tokenizer_emit_string(self, "hr"))
         return -1;
     token = PyObject_CallObject(TagCloseSelfclose, NULL);
     if (!token)
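
A note on the Tokenizer_emit_style_tag() change above: the old version took a tick count and built the markup string at runtime in a local chr_markup buffer; the new version takes the markup literal ("''" or "'''") directly and hands it straight to PyBytes_FromString(). A small standalone sketch of the equivalence (markup_from_ticks() is a hypothetical name for the removed loop):

#include <stdio.h>
#include <string.h>

/* Hypothetical helper reproducing the removed loop: build a string
   of `ticks` apostrophes, as the old chr_markup code did. */
static void markup_from_ticks(char* out, int ticks)
{
    int i;

    for (i = 0; i < ticks; i++)
        out[i] = '\'';
    out[ticks] = '\0';
}

int main(void)
{
    char old_markup[4];

    markup_from_ticks(old_markup, 3);
    /* The new code skips the loop and passes the literal directly,
       e.g. Tokenizer_emit_style_tag(self, "b", "'''", stack). */
    printf("\"%s\" == \"%s\"\n", old_markup, "'''");
    return strcmp(old_markup, "'''") != 0;
}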

