Browse Source

Clean up whitespace / newlines.

tags/v0.2
Ben Kurtovic 11 years ago
parent
commit
06f02b9753
1 changed file with 108 additions and 123 deletions
  1. +108
    -123
      mwparserfromhell/parser/tokenizer.c

+ 108
- 123
mwparserfromhell/parser/tokenizer.c View File

@@ -106,9 +106,8 @@ Tokenizer_push(Tokenizer* self, int context)
top->stack = PyList_New(0);
top->context = context;
top->textbuffer = Textbuffer_new();
if (!top->textbuffer) {
if (!top->textbuffer)
return -1;
}
top->next = self->topstack;
self->topstack = top;
return 0;
@@ -140,11 +139,11 @@ static int
Tokenizer_push_textbuffer(Tokenizer* self)
{
struct Textbuffer* buffer = self->topstack->textbuffer;
if (buffer->size == 0 && !buffer->next) {
if (buffer->size == 0 && !buffer->next)
return 0;
}
PyObject* text = Textbuffer_render(buffer);
if (!text) return -1;
if (!text)
return -1;

PyObject* kwargs = PyDict_New();
if (!kwargs) {
@@ -156,20 +155,19 @@ Tokenizer_push_textbuffer(Tokenizer* self)

PyObject* token = PyObject_Call(Text, NOARGS, kwargs);
Py_DECREF(kwargs);
if (!token) return -1;
if (!token)
return -1;

if (PyList_Append(self->topstack->stack, token)) {
Py_DECREF(token);
return -1;
}

Py_DECREF(token);

Textbuffer_dealloc(buffer);
self->topstack->textbuffer = Textbuffer_new();
if (!self->topstack->textbuffer) {
if (!self->topstack->textbuffer)
return -1;
}
return 0;
}

@@ -239,10 +237,8 @@ Tokenizer_write(Tokenizer* self, PyObject* token)
{
if (Tokenizer_push_textbuffer(self))
return -1;

if (PyList_Append(self->topstack->stack, token))
return -1;

return 0;
}

@@ -254,10 +250,8 @@ Tokenizer_write_first(Tokenizer* self, PyObject* token)
{
if (Tokenizer_push_textbuffer(self))
return -1;

if (PyList_Insert(self->topstack->stack, 0, token))
return -1;

return 0;
}

@@ -270,9 +264,8 @@ Tokenizer_write_text(Tokenizer* self, Py_UNICODE text)
struct Textbuffer* buf = self->topstack->textbuffer;
if (buf->size == TEXTBUFFER_BLOCKSIZE) {
struct Textbuffer* new = Textbuffer_new();
if (!new) {
if (!new)
return -1;
}
new->next = buf;
self->topstack->textbuffer = new;
buf = new;
@@ -297,18 +290,20 @@ Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist)
case 1: {
pushed = 1;
struct Textbuffer* buffer = self->topstack->textbuffer;
if (buffer->size == 0 && !buffer->next) {
if (buffer->size == 0 && !buffer->next)
break;
}
PyObject* left = Textbuffer_render(buffer);
if (!left) return -1;
if (!left)
return -1;
PyObject* right = PyObject_GetAttrString(token, "text");
if (!right) return -1;
if (!right)
return -1;

PyObject* text = PyUnicode_Concat(left, right);
Py_DECREF(left);
Py_DECREF(right);
if (!text) return -1;
if (!text)
return -1;

if (PyObject_SetAttrString(token, "text", text)) {
Py_DECREF(text);
@@ -318,9 +313,8 @@ Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist)

Textbuffer_dealloc(buffer);
self->topstack->textbuffer = Textbuffer_new();
if (!self->topstack->textbuffer) {
if (!self->topstack->textbuffer)
return -1;
}
break;
}
case -1:
@@ -334,10 +328,8 @@ Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist)

PyObject* stack = self->topstack->stack;
Py_ssize_t size = PyList_GET_SIZE(stack);

if (PyList_SetSlice(stack, size, size, tokenlist))
return -1;

return 0;
}

@@ -351,7 +343,8 @@ Tokenizer_write_text_then_stack(Tokenizer* self, const char* text)
PyObject* stack = Tokenizer_pop(self);
int i = 0;
while (1) {
if (!text[i]) break;
if (!text[i])
break;
if (Tokenizer_write_text(self, (Py_UNICODE) text[i])) {
Py_XDECREF(stack);
return -1;
@@ -380,10 +373,8 @@ static PyObject*
Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
{
Py_ssize_t index = self->head + delta;

if (index >= self->length)
return EMPTY;

return PyList_GET_ITEM(self->text, index);
}

@@ -395,7 +386,6 @@ Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
{
if (delta > self->head)
return EMPTY;

Py_ssize_t index = self->head - delta;
return PyList_GET_ITEM(self->text, index);
}
@@ -457,21 +447,19 @@ Tokenizer_parse_template_or_argument(Tokenizer* self)
Py_XDECREF(text);
return 0;
}
else {
else
braces -= 2;
}
}
else {
else
braces -= 3;
}

if (braces) {
if (braces)
self->head++;
}
}

PyObject* tokenlist = Tokenizer_pop(self);
if (!tokenlist) return -1;
if (!tokenlist)
return -1;
if (Tokenizer_write_all(self, tokenlist)) {
Py_DECREF(tokenlist);
return -1;
@@ -495,7 +483,8 @@ Tokenizer_parse_template(Tokenizer* self)
self->head = reset;
return 0;
}
if (!template) return -1;
if (!template)
return -1;

token = PyObject_CallObject(TemplateOpen, NULL);
if (!token) {
@@ -517,14 +506,14 @@ Tokenizer_parse_template(Tokenizer* self)
Py_DECREF(template);

token = PyObject_CallObject(TemplateClose, NULL);
if (!token) return -1;
if (!token)
return -1;

if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
}
Py_DECREF(token);

return 0;
}

@@ -542,7 +531,8 @@ Tokenizer_parse_argument(Tokenizer* self)
self->head = reset;
return 0;
}
if (!argument) return -1;
if (!argument)
return -1;

token = PyObject_CallObject(ArgumentOpen, NULL);
if (!token) {
@@ -564,14 +554,14 @@ Tokenizer_parse_argument(Tokenizer* self)
Py_DECREF(argument);

token = PyObject_CallObject(ArgumentClose, NULL);
if (!token) return -1;
if (!token)
return -1;

if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
}
Py_DECREF(token);

return 0;
}

@@ -581,28 +571,27 @@ Tokenizer_parse_argument(Tokenizer* self)
static int
Tokenizer_handle_template_param(Tokenizer* self)
{
if (self->topstack->context & LC_TEMPLATE_NAME) {
if (self->topstack->context & LC_TEMPLATE_NAME)
self->topstack->context ^= LC_TEMPLATE_NAME;
}
else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) {
else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE)
self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE;
}

if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
PyObject* stack = Tokenizer_pop_keeping_context(self);
if (!stack) return -1;
if (!stack)
return -1;
if (Tokenizer_write_all(self, stack)) {
Py_DECREF(stack);
return -1;
}
Py_DECREF(stack);
}
else {
else
self->topstack->context |= LC_TEMPLATE_PARAM_KEY;
}

PyObject* token = PyObject_CallObject(TemplateParamSeparator, NULL);
if (!token) return -1;
if (!token)
return -1;

if (Tokenizer_write(self, token)) {
Py_DECREF(token);
@@ -622,7 +611,8 @@ static int
Tokenizer_handle_template_param_value(Tokenizer* self)
{
PyObject* stack = Tokenizer_pop_keeping_context(self);
if (!stack) return -1;
if (!stack)
return -1;
if (Tokenizer_write_all(self, stack)) {
Py_DECREF(stack);
return -1;
@@ -633,8 +623,8 @@ Tokenizer_handle_template_param_value(Tokenizer* self)
self->topstack->context |= LC_TEMPLATE_PARAM_VALUE;

PyObject* token = PyObject_CallObject(TemplateParamEquals, NULL);
if (!token) return -1;
if (!token)
return -1;
if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
@@ -652,14 +642,14 @@ Tokenizer_handle_template_end(Tokenizer* self)
PyObject* stack;
if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
stack = Tokenizer_pop_keeping_context(self);
if (!stack) return NULL;
if (!stack)
return NULL;
if (Tokenizer_write_all(self, stack)) {
Py_DECREF(stack);
return NULL;
}
Py_DECREF(stack);
}

self->head++;
stack = Tokenizer_pop(self);
return stack;
@@ -675,8 +665,8 @@ Tokenizer_handle_argument_separator(Tokenizer* self)
self->topstack->context |= LC_ARGUMENT_DEFAULT;

PyObject* token = PyObject_CallObject(ArgumentSeparator, NULL);
if (!token) return -1;
if (!token)
return -1;
if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
@@ -702,11 +692,12 @@ Tokenizer_handle_argument_end(Tokenizer* self)
static int
Tokenizer_parse_wikilink(Tokenizer* self)
{
self->head += 2;
Py_ssize_t reset = self->head - 1;
Py_ssize_t reset;
PyObject *token, *wikilink;

PyObject *token;
PyObject *wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE);
self->head += 2;
reset = self->head - 1;
wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE);

if (BAD_ROUTE) {
RESET_ROUTE();
@@ -718,7 +709,8 @@ Tokenizer_parse_wikilink(Tokenizer* self)
}
return 0;
}
if (!wikilink) return -1;
if (!wikilink)
return -1;

token = PyObject_CallObject(WikilinkOpen, NULL);
if (!token) {
@@ -740,8 +732,8 @@ Tokenizer_parse_wikilink(Tokenizer* self)
Py_DECREF(wikilink);

token = PyObject_CallObject(WikilinkClose, NULL);
if (!token) return -1;
if (!token)
return -1;
if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
@@ -760,8 +752,8 @@ Tokenizer_handle_wikilink_separator(Tokenizer* self)
self->topstack->context |= LC_WIKILINK_TEXT;

PyObject* token = PyObject_CallObject(WikilinkSeparator, NULL);
if (!token) return -1;
if (!token)
return -1;
if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
@@ -866,14 +858,13 @@ Tokenizer_parse_heading(Tokenizer* self)
free(heading);

token = PyObject_CallObject(HeadingEnd, NULL);
if (!token) return -1;
if (!token)
return -1;
if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
}
Py_DECREF(token);

self->global ^= GL_HEADING;
return 0;
}
@@ -931,7 +922,8 @@ Tokenizer_handle_heading_end(Tokenizer* self)
}

PyObject* stack = Tokenizer_pop(self);
if (!stack) return NULL;
if (!stack)
return NULL;

HeadingData* heading = malloc(sizeof(HeadingData));
if (!heading) {
@@ -955,7 +947,8 @@ Tokenizer_really_parse_entity(Tokenizer* self)
char *valid, *text, *def;

token = PyObject_CallObject(HTMLEntityStart, NULL);
if (!token) return -1;
if (!token)
return -1;
if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
@@ -972,7 +965,8 @@ Tokenizer_really_parse_entity(Tokenizer* self)
if (this == *"#") {
numeric = 1;
token = PyObject_CallObject(HTMLEntityNumeric, NULL);
if (!token) return -1;
if (!token)
return -1;
if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
@@ -988,11 +982,13 @@ Tokenizer_really_parse_entity(Tokenizer* self)
if (this == *"x" || this == *"X") {
hexadecimal = 1;
kwargs = PyDict_New();
if (!kwargs) return -1;
if (!kwargs)
return -1;
PyDict_SetItemString(kwargs, "char", Tokenizer_read(self, 0));
PyObject* token = PyObject_Call(HTMLEntityHex, NOARGS, kwargs);
Py_DECREF(kwargs);
if (!token) return -1;
if (!token)
return -1;
if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
@@ -1000,13 +996,11 @@ Tokenizer_really_parse_entity(Tokenizer* self)
Py_DECREF(token);
self->head++;
}
else {
else
hexadecimal = 0;
}
}
else {
else
numeric = hexadecimal = 0;
}

if (hexadecimal)
valid = "0123456789abcdefABCDEF";
@@ -1091,7 +1085,8 @@ Tokenizer_really_parse_entity(Tokenizer* self)
Py_DECREF(textobj);
token = PyObject_Call(Text, NOARGS, kwargs);
Py_DECREF(kwargs);
if (!token) return -1;
if (!token)
return -1;
if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
@@ -1099,7 +1094,8 @@ Tokenizer_really_parse_entity(Tokenizer* self)
Py_DECREF(token);

token = PyObject_CallObject(HTMLEntityEnd, NULL);
if (!token) return -1;
if (!token)
return -1;
if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
@@ -1117,9 +1113,8 @@ Tokenizer_parse_entity(Tokenizer* self)
Py_ssize_t reset = self->head;
if (Tokenizer_push(self, 0))
return -1;

if (Tokenizer_really_parse_entity(self))
return -1;
return -1;

if (BAD_ROUTE) {
RESET_ROUTE();
@@ -1130,12 +1125,12 @@ Tokenizer_parse_entity(Tokenizer* self)
}

PyObject* tokenlist = Tokenizer_pop(self);
if (!tokenlist) return -1;
if (!tokenlist)
return -1;
if (Tokenizer_write_all(self, tokenlist)) {
Py_DECREF(tokenlist);
return -1;
}

Py_DECREF(tokenlist);
return 0;
}
@@ -1158,7 +1153,8 @@ Tokenizer_parse_comment(Tokenizer* self)
const char* text = "<!--";
int i = 0;
while (1) {
if (!text[i]) return 0;
if (!text[i])
return 0;
if (Tokenizer_write_text(self, (Py_UNICODE) text[i])) {
Py_XDECREF(text);
return -1;
@@ -1167,7 +1163,8 @@ Tokenizer_parse_comment(Tokenizer* self)
}
return 0;
}
if (!comment) return -1;
if (!comment)
return -1;

token = PyObject_CallObject(CommentStart, NULL);
if (!token) {
@@ -1181,7 +1178,6 @@ Tokenizer_parse_comment(Tokenizer* self)
return -1;
}
Py_DECREF(token);

if (Tokenizer_write_all(self, comment)) {
Py_DECREF(comment);
return -1;
@@ -1189,8 +1185,8 @@ Tokenizer_parse_comment(Tokenizer* self)
Py_DECREF(comment);

token = PyObject_CallObject(CommentEnd, NULL);
if (!token) return -1;
if (!token)
return -1;
if (Tokenizer_write(self, token)) {
Py_DECREF(token);
return -1;
@@ -1232,12 +1228,10 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
}
self->topstack->context ^= LC_FAIL_ON_RBRACE;
}
else if (data == *"{") {
else if (data == *"{")
self->topstack->context |= LC_FAIL_ON_LBRACE;
}
else if (data == *"}") {
else if (data == *"}")
self->topstack->context |= LC_FAIL_ON_RBRACE;
}
}

if (context & LC_HAS_TEXT) {
@@ -1248,14 +1242,12 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
}
}
else {
if (data == *"\n") {
if (data == *"\n")
self->topstack->context |= LC_FAIL_ON_TEXT;
}
}
}
else if (!Py_UNICODE_ISSPACE(data)) {
else if (!Py_UNICODE_ISSPACE(data))
self->topstack->context |= LC_HAS_TEXT;
}
}

/*
@@ -1301,9 +1293,8 @@ Tokenizer_parse(Tokenizer* self, int context)
PyObject* trash = Tokenizer_pop(self);
Py_XDECREF(trash);
}
if (this_context & fail_contexts) {
if (this_context & fail_contexts)
return Tokenizer_fail_route(self);
}
return Tokenizer_pop(self);
}

@@ -1311,9 +1302,8 @@ Tokenizer_parse(Tokenizer* self, int context)

if (this_context & LC_COMMENT) {
if (this == next && next == *"-") {
if (Tokenizer_READ(self, 2) == *">") {
if (Tokenizer_READ(self, 2) == *">")
return Tokenizer_pop(self);
}
}
Tokenizer_write_text(self, this);
}
@@ -1331,9 +1321,8 @@ Tokenizer_parse(Tokenizer* self, int context)
if (Tokenizer_handle_template_param_value(self))
return NULL;
}
else if (this == next && next == *"}" && this_context & LC_TEMPLATE) {
else if (this == next && next == *"}" && this_context & LC_TEMPLATE)
return Tokenizer_handle_template_end(self);
}
else if (this == *"|" && this_context & LC_ARGUMENT_NAME) {
if (Tokenizer_handle_argument_separator(self))
return NULL;
@@ -1359,25 +1348,21 @@ Tokenizer_parse(Tokenizer* self, int context)
if (Tokenizer_handle_wikilink_separator(self))
return NULL;
}
else if (this == next && next == *"]" && this_context & LC_WIKILINK) {
else if (this == next && next == *"]" && this_context & LC_WIKILINK)
return Tokenizer_handle_wikilink_end(self);
}
else if (this == *"=" && !(self->global & GL_HEADING)) {
last = *PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, 1));
if (last == *"\n" || last == *"") {
if (Tokenizer_parse_heading(self))
return NULL;
}
else {
else
Tokenizer_write_text(self, this);
}
}
else if (this == *"=" && this_context & LC_HEADING) {
else if (this == *"=" && this_context & LC_HEADING)
return (PyObject*) Tokenizer_handle_heading_end(self);
}
else if (this == *"\n" && this_context & LC_HEADING) {
else if (this == *"\n" && this_context & LC_HEADING)
return Tokenizer_fail_route(self);
}
else if (this == *"&") {
if (Tokenizer_parse_entity(self))
return NULL;
@@ -1388,14 +1373,11 @@ Tokenizer_parse(Tokenizer* self, int context)
if (Tokenizer_parse_comment(self))
return NULL;
}
else {
else
Tokenizer_write_text(self, this);
}
}
else {
else
Tokenizer_write_text(self, this);
}

self->head++;
}
}
@@ -1414,9 +1396,8 @@ Tokenizer_tokenize(Tokenizer* self, PyObject* args)
const char* encoded;
Py_ssize_t size;

if (!PyArg_ParseTuple(args, "s#", &encoded, &size)) {
if (!PyArg_ParseTuple(args, "s#", &encoded, &size))
return NULL;
}

PyObject* temp;
temp = PyUnicode_FromStringAndSize(encoded, size);
@@ -1434,7 +1415,6 @@ Tokenizer_tokenize(Tokenizer* self, PyObject* args)
}

self->length = PyList_GET_SIZE(self->text);

return Tokenizer_parse(self, 0);
}

@@ -1453,16 +1433,19 @@ init_tokenizer(void)
PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);

PyObject* htmlentitydefs = PyImport_ImportModule("htmlentitydefs");
if (!htmlentitydefs) return;
if (!htmlentitydefs)
return;

PyObject* defmap = PyObject_GetAttrString(htmlentitydefs, "entitydefs");
if (!defmap) return;
if (!defmap)
return;
Py_DECREF(htmlentitydefs);

unsigned numdefs = (unsigned) PyDict_Size(defmap);
entitydefs = calloc(numdefs + 1, sizeof(char*));
PyObject* deflist = PyDict_Keys(defmap);
if (!deflist) return;
if (!deflist)
return;
Py_DECREF(defmap);

unsigned i;
@@ -1478,7 +1461,8 @@ init_tokenizer(void)
PyObject* globals = PyEval_GetGlobals();
PyObject* locals = PyEval_GetLocals();
PyObject* fromlist = PyList_New(1);
if (!fromlist) return;
if (!fromlist)
return;
PyObject* submodname = PyBytes_FromString("tokens");
if (!submodname) {
Py_DECREF(fromlist);
@@ -1488,7 +1472,8 @@ init_tokenizer(void)

PyObject* tokmodule = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
Py_DECREF(fromlist);
if (!tokmodule) return;
if (!tokmodule)
return;

tokens = PyObject_GetAttrString(tokmodule, "tokens");
Py_DECREF(tokmodule);


Loading…
Cancel
Save