Browse source

Remove a bunch of white space.

tags/v0.2
Ben Kurtovic 11 years ago
parent
commit
88a2b95030
1 changed file with 65 additions and 140 deletions
mwparserfromhell/parser/tokenizer.c  +65  -140
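Despite the one-line commit message, the change is not purely cosmetic: besides deleting blank lines, the diff below hoists local variable declarations to the top of each function, the declare-before-statements style that C89/C90 compilers require. A minimal before/after sketch of that pattern, using a hypothetical helper rather than code from this file:

    #include <Python.h>

    /* Before: declaration at first use, blank lines between steps (C99 style). */
    static int get_first_before(PyObject* list, PyObject** out)
    {
        if (PyList_GET_SIZE(list) == 0)
            return -1;

        PyObject* item = PyList_GET_ITEM(list, 0);  /* mid-block declaration */
        *out = item;  /* borrowed reference */
        return 0;
    }

    /* After: declarations hoisted above the first statement, as in this diff. */
    static int get_first_after(PyObject* list, PyObject** out)
    {
        PyObject* item;  /* declared before any statement */

        if (PyList_GET_SIZE(list) == 0)
            return -1;
        item = PyList_GET_ITEM(list, 0);
        *out = item;  /* borrowed reference */
        return 0;
    }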

@@ -82,7 +82,6 @@ Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
     static char* kwlist[] = {NULL};
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
         return -1;
-
     self->text = Py_None;
     Py_INCREF(Py_None);
     self->topstack = NULL;
@@ -144,7 +143,6 @@ Tokenizer_push_textbuffer(Tokenizer* self)
     PyObject* text = Textbuffer_render(buffer);
     if (!text)
         return -1;
-
     PyObject* kwargs = PyDict_New();
     if (!kwargs) {
         Py_DECREF(text);
@@ -152,18 +150,15 @@ Tokenizer_push_textbuffer(Tokenizer* self)
     }
     PyDict_SetItemString(kwargs, "text", text);
     Py_DECREF(text);
-
     PyObject* token = PyObject_Call(Text, NOARGS, kwargs);
     Py_DECREF(kwargs);
     if (!token)
         return -1;
-
     if (PyList_Append(self->topstack->stack, token)) {
         Py_DECREF(token);
         return -1;
     }
     Py_DECREF(token);
-
     Textbuffer_dealloc(buffer);
     self->topstack->textbuffer = Textbuffer_new();
     if (!self->topstack->textbuffer)
@@ -189,10 +184,8 @@ Tokenizer_pop(Tokenizer* self)
 {
     if (Tokenizer_push_textbuffer(self))
         return NULL;
-
     PyObject* stack = self->topstack->stack;
     Py_INCREF(stack);
-
     Tokenizer_delete_top_of_stack(self);
     return stack;
 }
@@ -206,11 +199,9 @@ Tokenizer_pop_keeping_context(Tokenizer* self)
 {
     if (Tokenizer_push_textbuffer(self))
         return NULL;
-
     PyObject* stack = self->topstack->stack;
     Py_INCREF(stack);
     int context = self->topstack->context;
-
     Tokenizer_delete_top_of_stack(self);
     self->topstack->context = context;
     return stack;
@@ -282,35 +273,36 @@ static int
 Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist)
 {
     int pushed = 0;
+    PyObject *stack, *token, *left, *right, *text;
+    struct Textbuffer* buffer;
+    Py_ssize_t size;
+
     if (PyList_GET_SIZE(tokenlist) > 0) {
-        PyObject* token = PyList_GET_ITEM(tokenlist, 0);
+        token = PyList_GET_ITEM(tokenlist, 0);
         switch (PyObject_IsInstance(token, Text)) {
             case 0:
                 break;
             case 1: {
                 pushed = 1;
-                struct Textbuffer* buffer = self->topstack->textbuffer;
+                buffer = self->topstack->textbuffer;
                 if (buffer->size == 0 && !buffer->next)
                     break;
-                PyObject* left = Textbuffer_render(buffer);
+                left = Textbuffer_render(buffer);
                 if (!left)
                     return -1;
-                PyObject* right = PyObject_GetAttrString(token, "text");
+                right = PyObject_GetAttrString(token, "text");
                 if (!right)
                     return -1;
-
-                PyObject* text = PyUnicode_Concat(left, right);
+                text = PyUnicode_Concat(left, right);
                 Py_DECREF(left);
                 Py_DECREF(right);
                 if (!text)
                     return -1;
-
                 if (PyObject_SetAttrString(token, "text", text)) {
                     Py_DECREF(text);
                     return -1;
                 }
                 Py_DECREF(text);
-
                 Textbuffer_dealloc(buffer);
                 self->topstack->textbuffer = Textbuffer_new();
                 if (!self->topstack->textbuffer)
@@ -325,9 +317,8 @@ Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist)
         if (Tokenizer_push_textbuffer(self))
             return -1;
     }
-
-    PyObject* stack = self->topstack->stack;
-    Py_ssize_t size = PyList_GET_SIZE(stack);
+    stack = self->topstack->stack;
+    size = PyList_GET_SIZE(stack);
     if (PyList_SetSlice(stack, size, size, tokenlist))
         return -1;
     return 0;
@@ -351,7 +342,6 @@ Tokenizer_write_text_then_stack(Tokenizer* self, const char* text)
         }
         i++;
     }
-
     if (stack) {
         if (PyList_GET_SIZE(stack) > 0) {
             if (Tokenizer_write_all(self, stack)) {
@@ -361,7 +351,6 @@ Tokenizer_write_text_then_stack(Tokenizer* self, const char* text)
         }
         Py_DECREF(stack);
     }
-
     self->head--;
     return 0;
 }
@@ -396,23 +385,22 @@ Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
 static int
 Tokenizer_parse_template_or_argument(Tokenizer* self)
 {
-    self->head += 2;
     unsigned int braces = 2, i;
+    PyObject *tokenlist;
 
+    self->head += 2;
     while (Tokenizer_READ(self, 0) == *"{") {
         self->head++;
         braces++;
     }
     if (Tokenizer_push(self, 0))
         return -1;
-
     while (braces) {
         if (braces == 1) {
             if (Tokenizer_write_text_then_stack(self, "{"))
                 return -1;
             return 0;
         }
-
         if (braces == 2) {
             if (Tokenizer_parse_template(self))
                 return -1;
@@ -425,15 +413,12 @@ Tokenizer_parse_template_or_argument(Tokenizer* self)
             }
             break;
         }
-
        if (Tokenizer_parse_argument(self))
            return -1;
-
        if (BAD_ROUTE) {
            RESET_ROUTE();
            if (Tokenizer_parse_template(self))
                return -1;
-
            if (BAD_ROUTE) {
                RESET_ROUTE();
                char text[braces + 1];
@@ -443,7 +428,6 @@ Tokenizer_parse_template_or_argument(Tokenizer* self)
                    Py_XDECREF(text);
                    return -1;
                }
-
                Py_XDECREF(text);
                return 0;
            }
@@ -452,19 +436,16 @@ Tokenizer_parse_template_or_argument(Tokenizer* self)
        }
        else
            braces -= 3;
-
        if (braces)
            self->head++;
    }
-
-    PyObject* tokenlist = Tokenizer_pop(self);
+    tokenlist = Tokenizer_pop(self);
    if (!tokenlist)
        return -1;
    if (Tokenizer_write_all(self, tokenlist)) {
        Py_DECREF(tokenlist);
        return -1;
    }
-
    Py_DECREF(tokenlist);
    return 0;
 }
@@ -485,30 +466,25 @@ Tokenizer_parse_template(Tokenizer* self)
     }
     if (!template)
         return -1;
-
     token = PyObject_CallObject(TemplateOpen, NULL);
     if (!token) {
         Py_DECREF(template);
         return -1;
     }
-
     if (Tokenizer_write_first(self, token)) {
         Py_DECREF(token);
         Py_DECREF(template);
         return -1;
     }
     Py_DECREF(token);
-
     if (Tokenizer_write_all(self, template)) {
         Py_DECREF(template);
         return -1;
     }
     Py_DECREF(template);
-
     token = PyObject_CallObject(TemplateClose, NULL);
     if (!token)
         return -1;
-
     if (Tokenizer_write(self, token)) {
         Py_DECREF(token);
         return -1;
@@ -533,30 +509,25 @@ Tokenizer_parse_argument(Tokenizer* self)
     }
     if (!argument)
         return -1;
-
     token = PyObject_CallObject(ArgumentOpen, NULL);
     if (!token) {
         Py_DECREF(argument);
         return -1;
     }
-
     if (Tokenizer_write_first(self, token)) {
         Py_DECREF(token);
         Py_DECREF(argument);
         return -1;
     }
     Py_DECREF(token);
-
     if (Tokenizer_write_all(self, argument)) {
         Py_DECREF(argument);
         return -1;
     }
     Py_DECREF(argument);
-
     token = PyObject_CallObject(ArgumentClose, NULL);
     if (!token)
         return -1;
-
     if (Tokenizer_write(self, token)) {
         Py_DECREF(token);
         return -1;
@@ -571,13 +542,14 @@ Tokenizer_parse_argument(Tokenizer* self)
 static int
 Tokenizer_handle_template_param(Tokenizer* self)
 {
+    PyObject *stack, *token;
+
     if (self->topstack->context & LC_TEMPLATE_NAME)
         self->topstack->context ^= LC_TEMPLATE_NAME;
     else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE)
         self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE;
-
     if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
-        PyObject* stack = Tokenizer_pop_keeping_context(self);
+        stack = Tokenizer_pop_keeping_context(self);
         if (!stack)
             return -1;
         if (Tokenizer_write_all(self, stack)) {
@@ -589,16 +561,14 @@ Tokenizer_handle_template_param(Tokenizer* self)
     else
         self->topstack->context |= LC_TEMPLATE_PARAM_KEY;
 
-
-    PyObject* token = PyObject_CallObject(TemplateParamSeparator, NULL);
+    token = PyObject_CallObject(TemplateParamSeparator, NULL);
     if (!token)
         return -1;
-
     if (Tokenizer_write(self, token)) {
         Py_DECREF(token);
         return -1;
     }
     Py_DECREF(token);
-
     if (Tokenizer_push(self, self->topstack->context))
         return -1;
     return 0;
@@ -610,7 +580,9 @@ Tokenizer_handle_template_param(Tokenizer* self)
 static int
 Tokenizer_handle_template_param_value(Tokenizer* self)
 {
-    PyObject* stack = Tokenizer_pop_keeping_context(self);
+    PyObject *stack, *token;
+
+    stack = Tokenizer_pop_keeping_context(self);
     if (!stack)
         return -1;
     if (Tokenizer_write_all(self, stack)) {
@@ -618,11 +590,9 @@ Tokenizer_handle_template_param_value(Tokenizer* self)
         return -1;
     }
     Py_DECREF(stack);
-
     self->topstack->context ^= LC_TEMPLATE_PARAM_KEY;
     self->topstack->context |= LC_TEMPLATE_PARAM_VALUE;
-
-    PyObject* token = PyObject_CallObject(TemplateParamEquals, NULL);
+    token = PyObject_CallObject(TemplateParamEquals, NULL);
     if (!token)
         return -1;
     if (Tokenizer_write(self, token)) {
@@ -640,6 +610,7 @@ static PyObject*
 Tokenizer_handle_template_end(Tokenizer* self)
 {
     PyObject* stack;
+
     if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
         stack = Tokenizer_pop_keeping_context(self);
         if (!stack)
@@ -663,7 +634,6 @@ Tokenizer_handle_argument_separator(Tokenizer* self)
 {
     self->topstack->context ^= LC_ARGUMENT_NAME;
     self->topstack->context |= LC_ARGUMENT_DEFAULT;
-
     PyObject* token = PyObject_CallObject(ArgumentSeparator, NULL);
     if (!token)
         return -1;
@@ -693,16 +663,15 @@ static int
 Tokenizer_parse_wikilink(Tokenizer* self)
 {
     Py_ssize_t reset;
-    PyObject *token, *wikilink;
+    PyObject *wikilink, *token;
+    int i;
 
     self->head += 2;
     reset = self->head - 1;
     wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE);
-
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        int i;
         for (i = 0; i < 2; i++) {
             if (Tokenizer_write_text(self, *"["))
                 return -1;
@@ -711,26 +680,22 @@ Tokenizer_parse_wikilink(Tokenizer* self)
     }
     if (!wikilink)
         return -1;
-
     token = PyObject_CallObject(WikilinkOpen, NULL);
     if (!token) {
         Py_DECREF(wikilink);
         return -1;
     }
-
     if (Tokenizer_write(self, token)) {
         Py_DECREF(token);
         Py_DECREF(wikilink);
         return -1;
     }
     Py_DECREF(token);
-
     if (Tokenizer_write_all(self, wikilink)) {
         Py_DECREF(wikilink);
         return -1;
     }
     Py_DECREF(wikilink);
-
     token = PyObject_CallObject(WikilinkClose, NULL);
     if (!token)
         return -1;
@@ -750,7 +715,6 @@ Tokenizer_handle_wikilink_separator(Tokenizer* self)
 {
     self->topstack->context ^= LC_WIKILINK_TITLE;
     self->topstack->context |= LC_WIKILINK_TEXT;
-
     PyObject* token = PyObject_CallObject(WikilinkSeparator, NULL);
     if (!token)
         return -1;
@@ -779,19 +743,19 @@ Tokenizer_handle_wikilink_end(Tokenizer* self)
 static int
 Tokenizer_parse_heading(Tokenizer* self)
 {
-    self->global |= GL_HEADING;
     Py_ssize_t reset = self->head;
-    self->head += 1;
-    int best = 1, i;
+    int best = 1, i, context, diff;
+    HeadingData *heading;
+    PyObject *level, *kwargs, *token;
 
+    self->global |= GL_HEADING;
+    self->head += 1;
     while (Tokenizer_READ(self, 0) == *"=") {
         best++;
         self->head++;
     }
-
-    int context = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1);
-    HeadingData* heading = (HeadingData*) Tokenizer_parse(self, context);
-
+    context = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1);
+    heading = (HeadingData*) Tokenizer_parse(self, context);
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset + best - 1;
@@ -804,14 +768,13 @@ Tokenizer_parse_heading(Tokenizer* self)
         return 0;
     }
 
-    PyObject* level = PyInt_FromSsize_t(heading->level);
+    level = PyInt_FromSsize_t(heading->level);
     if (!level) {
         Py_DECREF(heading->title);
         free(heading);
         return -1;
     }
-
-    PyObject* kwargs = PyDict_New();
+    kwargs = PyDict_New();
     if (!kwargs) {
         Py_DECREF(level);
         Py_DECREF(heading->title);
@@ -820,15 +783,13 @@ Tokenizer_parse_heading(Tokenizer* self)
     }
     PyDict_SetItemString(kwargs, "level", level);
     Py_DECREF(level);
-
-    PyObject* token = PyObject_Call(HeadingStart, NOARGS, kwargs);
+    token = PyObject_Call(HeadingStart, NOARGS, kwargs);
     Py_DECREF(kwargs);
     if (!token) {
         Py_DECREF(heading->title);
         free(heading);
         return -1;
     }
-
     if (Tokenizer_write(self, token)) {
         Py_DECREF(token);
         Py_DECREF(heading->title);
@@ -836,9 +797,8 @@ Tokenizer_parse_heading(Tokenizer* self)
         return -1;
     }
     Py_DECREF(token);
-
     if (heading->level < best) {
-        int diff = best - heading->level;
+        diff = best - heading->level;
         char difftext[diff + 1];
         for (i = 0; i < diff; i++) difftext[i] = *"=";
         difftext[diff] = *"";
@@ -848,7 +808,6 @@ Tokenizer_parse_heading(Tokenizer* self)
             return -1;
         }
     }
-
     if (Tokenizer_write_all(self, heading->title)) {
         Py_DECREF(heading->title);
         free(heading);
@@ -856,7 +815,6 @@ Tokenizer_parse_heading(Tokenizer* self)
     }
     Py_DECREF(heading->title);
     free(heading);
-
     token = PyObject_CallObject(HeadingEnd, NULL);
     if (!token)
         return -1;
@@ -875,24 +833,24 @@
 static HeadingData*
 Tokenizer_handle_heading_end(Tokenizer* self)
 {
-    Py_ssize_t reset = self->head;
-    self->head += 1;
-    Py_ssize_t best = 1;
-    int i;
+    Py_ssize_t reset = self->head, best;
+    int i, current, level, diff;
+    HeadingData *after, *heading;
+    PyObject *stack;
 
+    self->head += 1;
+    best = 1;
     while (Tokenizer_READ(self, 0) == *"=") {
         best++;
         self->head++;
     }
-
-    int current = log2(self->topstack->context / LC_HEADING_LEVEL_1) + 1;
-    int level = current > best ? (best > 6 ? 6 : best) : (current > 6 ? 6 : current);
-    HeadingData* after = (HeadingData*) Tokenizer_parse(self, self->topstack->context);
-
+    current = log2(self->topstack->context / LC_HEADING_LEVEL_1) + 1;
+    level = current > best ? (best > 6 ? 6 : best) : (current > 6 ? 6 : current);
+    after = (HeadingData*) Tokenizer_parse(self, self->topstack->context);
     if (BAD_ROUTE) {
         RESET_ROUTE();
         if (level < best) {
-            int diff = best - level;
+            diff = best - level;
             char difftext[diff + 1];
             for (i = 0; i < diff; i++) difftext[i] = *"=";
             difftext[diff] = *"";
@@ -901,7 +859,6 @@ Tokenizer_handle_heading_end(Tokenizer* self)
         }
         self->head = reset + best - 1;
     }
-
     else {
         char text[best + 1];
         for (i = 0; i < best; i++) text[i] = *"=";
@@ -920,12 +877,10 @@ Tokenizer_handle_heading_end(Tokenizer* self)
         level = after->level;
         free(after);
     }
-
-    PyObject* stack = Tokenizer_pop(self);
+    stack = Tokenizer_pop(self);
     if (!stack)
         return NULL;
-
-    HeadingData* heading = malloc(sizeof(HeadingData));
+    heading = malloc(sizeof(HeadingData));
     if (!heading) {
         PyErr_NoMemory();
         return NULL;
@@ -946,6 +901,12 @@ Tokenizer_really_parse_entity(Tokenizer* self)
     int numeric, hexadecimal, i, j, test;
     char *valid, *text, *def;
 
+    #define FAIL_ROUTE_AND_EXIT() { \
+        Tokenizer_fail_route(self); \
+        free(text);                 \
+        return 0;                   \
+    }
+
     token = PyObject_CallObject(HTMLEntityStart, NULL);
     if (!token)
         return -1;
@@ -954,9 +915,7 @@ Tokenizer_really_parse_entity(Tokenizer* self)
         return -1;
     }
     Py_DECREF(token);
-
     self->head++;
-
     this = Tokenizer_READ(self, 0);
     if (this == *"") {
         Tokenizer_fail_route(self);
@@ -972,7 +931,6 @@ Tokenizer_really_parse_entity(Tokenizer* self)
         return -1;
     }
     Py_DECREF(token);
-
     self->head++;
     this = Tokenizer_READ(self, 0);
     if (this == *"") {
@@ -985,7 +943,7 @@ Tokenizer_really_parse_entity(Tokenizer* self)
         if (!kwargs)
             return -1;
         PyDict_SetItemString(kwargs, "char", Tokenizer_read(self, 0));
-        PyObject* token = PyObject_Call(HTMLEntityHex, NOARGS, kwargs);
+        token = PyObject_Call(HTMLEntityHex, NOARGS, kwargs);
         Py_DECREF(kwargs);
         if (!token)
             return -1;
@@ -1001,26 +959,17 @@ Tokenizer_really_parse_entity(Tokenizer* self)
     }
     else
         numeric = hexadecimal = 0;
-
     if (hexadecimal)
         valid = "0123456789abcdefABCDEF";
     else if (numeric)
         valid = "0123456789";
     else
         valid = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
-
     text = malloc(MAX_ENTITY_SIZE * sizeof(char));
     if (!text) {
         PyErr_NoMemory();
         return -1;
     }
-
-    #define FAIL_ROUTE_AND_EXIT() { \
-        Tokenizer_fail_route(self); \
-        free(text);                 \
-        return 0;                   \
-    }
-
     i = 0;
     while (1) {
         this = Tokenizer_READ(self, 0);
@@ -1051,7 +1000,6 @@ Tokenizer_really_parse_entity(Tokenizer* self)
         self->head++;
         i++;
     }
-
     if (numeric) {
         sscanf(text, (hexadecimal ? "%x" : "%d"), &test);
         if (test < 1 || test > 0x10FFFF)
@@ -1068,14 +1016,12 @@ Tokenizer_really_parse_entity(Tokenizer* self)
             i++;
         }
     }
-
     textobj = PyUnicode_FromString(text);
     if (!textobj) {
         free(text);
         return -1;
     }
     free(text);
-
     kwargs = PyDict_New();
     if (!kwargs) {
         Py_DECREF(textobj);
@@ -1092,7 +1038,6 @@ Tokenizer_really_parse_entity(Tokenizer* self)
         return -1;
     }
     Py_DECREF(token);
-
     token = PyObject_CallObject(HTMLEntityEnd, NULL);
     if (!token)
         return -1;
@@ -1111,11 +1056,12 @@ static int
 Tokenizer_parse_entity(Tokenizer* self)
 {
     Py_ssize_t reset = self->head;
+    PyObject *tokenlist;
+
     if (Tokenizer_push(self, 0))
         return -1;
     if (Tokenizer_really_parse_entity(self))
         return -1;
-
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
@@ -1123,8 +1069,7 @@ Tokenizer_parse_entity(Tokenizer* self)
             return -1;
         return 0;
     }
-
-    PyObject* tokenlist = Tokenizer_pop(self);
+    tokenlist = Tokenizer_pop(self);
     if (!tokenlist)
         return -1;
     if (Tokenizer_write_all(self, tokenlist)) {
@@ -1141,17 +1086,17 @@ static int
 Tokenizer_parse_comment(Tokenizer* self)
 {
-    self->head += 4;
-    Py_ssize_t reset = self->head - 1;
-
-    PyObject *token;
-    PyObject *comment = Tokenizer_parse(self, LC_WIKILINK_TITLE);
+    Py_ssize_t reset = self->head + 3;
+    PyObject *token, *comment;
+    int i;
 
+    self->head += 4;
+    comment = Tokenizer_parse(self, LC_WIKILINK_TITLE);
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
         const char* text = "<!--";
-        int i = 0;
+        i = 0;
         while (1) {
             if (!text[i])
                 return 0;
@@ -1165,13 +1110,11 @@ Tokenizer_parse_comment(Tokenizer* self)
     }
     if (!comment)
         return -1;
-
     token = PyObject_CallObject(CommentStart, NULL);
     if (!token) {
         Py_DECREF(comment);
         return -1;
     }
-
     if (Tokenizer_write(self, token)) {
         Py_DECREF(token);
         Py_DECREF(comment);
@@ -1183,7 +1126,6 @@ Tokenizer_parse_comment(Tokenizer* self)
         return -1;
     }
     Py_DECREF(comment);
-
     token = PyObject_CallObject(CommentEnd, NULL);
     if (!token)
         return -1;
@@ -1206,7 +1148,6 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
         Tokenizer_fail_route(self);
         return;
     }
-
     if (context & (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE)) {
         if (data == *"{" || data == *"}" || data == *"[" || data == *"]") {
             self->topstack->context |= LC_FAIL_NEXT;
@@ -1233,7 +1174,6 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
         else if (data == *"}")
             self->topstack->context |= LC_FAIL_ON_RBRACE;
     }
-
     if (context & LC_HAS_TEXT) {
         if (context & LC_FAIL_ON_TEXT) {
             if (!Py_UNICODE_ISSPACE(data)) {
@@ -1263,17 +1203,14 @@ Tokenizer_parse(Tokenizer* self, int context)
 
     if (Tokenizer_push(self, context))
         return NULL;
-
     while (1) {
         this = Tokenizer_READ(self, 0);
         this_context = self->topstack->context;
-
         if (this_context & unsafe_contexts) {
             Tokenizer_verify_safe(self, this_context, this);
             if (BAD_ROUTE)
                 return NULL;
        }
-
        is_marker = 0;
        for (i = 0; i < NUM_MARKERS; i++) {
            if (*MARKERS[i] == this) {
@@ -1281,13 +1218,11 @@ Tokenizer_parse(Tokenizer* self, int context)
                break;
            }
        }
-
        if (!is_marker) {
            Tokenizer_write_text(self, this);
            self->head++;
            continue;
        }
-
        if (this == *"") {
            if (this_context & LC_TEMPLATE_PARAM_KEY) {
                PyObject* trash = Tokenizer_pop(self);
@@ -1297,9 +1232,7 @@ Tokenizer_parse(Tokenizer* self, int context)
                return Tokenizer_fail_route(self);
            return Tokenizer_pop(self);
        }
-
        next = Tokenizer_READ(self, 1);
-
        if (this_context & LC_COMMENT) {
            if (this == next && next == *"-") {
                if (Tokenizer_READ(self, 2) == *">")
@@ -1388,22 +1321,18 @@ Tokenizer_parse(Tokenizer* self, int context)
 static PyObject*
 Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 {
-    PyObject* text;
+    PyObject *text, *temp;
 
     if (!PyArg_ParseTuple(args, "U", &text)) {
         /* Failed to parse a Unicode object; try a string instead. */
         PyErr_Clear();
         const char* encoded;
         Py_ssize_t size;
-
         if (!PyArg_ParseTuple(args, "s#", &encoded, &size))
             return NULL;
-
-        PyObject* temp;
         temp = PyUnicode_FromStringAndSize(encoded, size);
        if (!text)
            return NULL;
-
        Py_XDECREF(self->text);
        text = PySequence_Fast(temp, "expected a sequence");
        Py_XDECREF(temp);
@@ -1413,7 +1342,6 @@ Tokenizer_tokenize(Tokenizer* self, PyObject* args)
        Py_XDECREF(self->text);
        self->text = PySequence_Fast(text, "expected a sequence");
    }
-
    self->length = PyList_GET_SIZE(self->text);
    return Tokenizer_parse(self, 0);
 }
@@ -1428,9 +1356,7 @@ init_tokenizer(void)
     TokenizerType.tp_new = PyType_GenericNew;
     if (PyType_Ready(&TokenizerType) < 0)
         return;
-
     module = Py_InitModule("_tokenizer", module_methods);
-
     Py_INCREF(&TokenizerType);
     PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
 
@@ -1445,7 +1371,6 @@ init_tokenizer(void)
     if (!deflist)
         return;
     Py_DECREF(defmap);
-
     numdefs = (unsigned) PyList_GET_SIZE(defmap);
     entitydefs = calloc(numdefs + 1, sizeof(char*));
     for (i = 0; i < numdefs; i++)
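A pattern worth noting in the hunks above: every token is built with PyObject_CallObject, handed to a writer, and then released on each path, success or failure alike. A condensed sketch of that reference-counting discipline, reusing names from this file (Tokenizer_write, WikilinkOpen) but simplified for illustration:

    /* The stack takes its own reference when the write succeeds,
       so the local reference must be dropped on every path. */
    static int write_open_token(Tokenizer* self)
    {
        PyObject* token = PyObject_CallObject(WikilinkOpen, NULL);
        if (!token)            /* construction failed; nothing to release */
            return -1;
        if (Tokenizer_write(self, token)) {
            Py_DECREF(token);  /* the error path still owns the token */
            return -1;
        }
        Py_DECREF(token);      /* drop the local reference after a successful write */
        return 0;
    }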