
Fill out Tokenizer_parse(); build a bunch of empty function definitions.

tags/v0.2
Ben Kurtovic, 12 years ago
commit 8729d20f07
1 file changed, 240 insertions(+), 12 deletions(-)
mwparserfromhell/parser/tokenizer.c

@@ -26,8 +26,8 @@ SOFTWARE.
 #endif
 
 #include <Python.h>
-#include "setjmp.h"
-#include "structmember.h"
+#include <setjmp.h>
+#include <structmember.h>
 
 static PyObject* EMPTY;

@@ -35,7 +35,10 @@ static PyObject* EMPTY;
 static const Py_UNICODE* MARKERS[] = {PU"{", PU"}", PU"[", PU"]", PU"<", PU">",
                                       PU"|", PU"=", PU"&", PU"#", PU"*", PU";",
                                       PU":", PU"/", PU"-", PU"!", PU"\n", PU""};
 #undef PU
 static const int NUM_MARKERS = 17;
 
+#define CONTEXT(name) PyInt_AsSsize_t((PyIntObject*) \
+                          PyObject_GetAttrString(contexts, name))
+
 static jmp_buf exception_env;
 static const int BAD_ROUTE = 1;
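
The new CONTEXT macro looks a context flag up by name on the Python-level contexts module every time it is used, which keeps the flag values defined in one place (Python) at the cost of an attribute lookup per use; note also that the reference returned by PyObject_GetAttrString is never released, so each use leaks one reference. A minimal sketch of what a call expands to, assuming contexts is the imported mwparserfromhell.parser.contexts module (the matching #define PU sits above this hunk and is not shown in the diff):

    /* Hedged sketch: roughly what CONTEXT("TEMPLATE") boils down to.
       As in the macro itself, a NULL result from GetAttrString is not
       checked and the attribute reference is never decref'd. */
    Py_ssize_t template_flag = PyInt_AsSsize_t((PyIntObject*)
        PyObject_GetAttrString(contexts, "TEMPLATE"));
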
@@ -103,6 +106,7 @@ Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
 
 #define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0)
 #define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1)
+#define Tokenizer_CONTEXT_VAL(self) PyInt_AsSsize_t((PyIntObject*) Tokenizer_CONTEXT(self))
 #define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2)
 
 static int
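
The three fixed-index macros imply that each entry of self->topstack is a three-slot frame of [token list, context flags, textbuffer list]; the new Tokenizer_CONTEXT_VAL simply unboxes slot 1 into a Py_ssize_t so the flags can be tested with bitwise AND from C. A hedged usage sketch (TEMPLATE_NAME is one of the flag names defined in the contexts module):

    /* Hedged sketch: test whether the current frame is inside a
       template name, via the new accessor plus the CONTEXT macro. */
    Py_ssize_t flags = Tokenizer_CONTEXT_VAL(self);
    if (flags & CONTEXT("TEMPLATE_NAME")) {
        /* ... handle template-name-specific syntax ... */
    }
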
@@ -125,11 +129,11 @@ Tokenizer_set_textbuffer(Tokenizer* self, PyObject* value)
     Add a new token stack, context, and textbuffer to the list.
 */
 static int
-Tokenizer_push(Tokenizer* self, int context)
+Tokenizer_push(Tokenizer* self, Py_ssize_t context)
 {
     PyObject* top = PyList_New(3);
     PyList_SET_ITEM(top, 0, PyList_New(0));
-    PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(0));
+    PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(context));
     PyList_SET_ITEM(top, 2, PyList_New(0));
 
     Py_XDECREF(self->topstack);
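
A hypothetical call site tying this change to the macros above: push now records the caller's context in slot 1 of the new frame instead of the hard-coded 0 (the rest of the function falls outside the hunk and is unchanged):

    /* Hypothetical usage: open a fresh frame for a template name.
       After the PyInt_FromSsize_t(context) fix, Tokenizer_CONTEXT_VAL
       reports these flags for the new frame rather than always 0. */
    Tokenizer_push(self, CONTEXT("TEMPLATE_NAME"));
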
@@ -345,7 +349,7 @@ Tokenizer_write_text_then_stack(Tokenizer* self, PyObject* text)
 }
 
 /*
-    Read the value at a relative point in the wikicode.
+    Read the value at a relative point in the wikicode, forwards.
 */
 static PyObject*
 Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
@@ -360,23 +364,247 @@ Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
 }
 
 /*
-    Parse the wikicode string, using *context* for when to stop.
+    Read the value at a relative point in the wikicode, backwards.
 */
 static PyObject*
-Tokenizer_parse(Tokenizer* self, int context)
+Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
 {
+    if (delta > self->head) {
+        return EMPTY;
+    }
+
+    Py_ssize_t index = self->head - delta;
+    return PySequence_Fast_GET_ITEM(self->text, index);
+}

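The two readers are mirror images: both index self->text, the wikicode split into single-character unicode strings, relative to self->head, and both return the module-level EMPTY sentinel rather than NULL when the offset runs off an end, so callers can compare results with == instead of checking for errors. An illustration under assumed values (the text and head position are made up):

    /* Hedged illustration. Assume self->text holds "abcde" as five
       one-character strings and self->head == 2 (pointing at "c"):
           Tokenizer_read(self, 0)            -> "c"
           Tokenizer_read(self, 2)            -> "e"
           Tokenizer_read_backwards(self, 2)  -> "a"
           Tokenizer_read_backwards(self, 3)  -> EMPTY   (delta > head)
    */
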
+static int
+Tokenizer_parse_template_or_argument(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_template(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_argument(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_verify_safe(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_handle_template_param(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_handle_template_param_value(Tokenizer* self)
+{
+    PyObject* this;
+
+}
+
+static PyObject*
+Tokenizer_handle_template_end(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_handle_argument_separator(Tokenizer* self)
+{
+
+}
+
+static PyObject*
+Tokenizer_handle_argument_end(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_wikilink(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_handle_wikilink_separator(Tokenizer* self)
+{
+
+}
+
+static PyObject*
+Tokenizer_handle_wikilink_end(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_heading(Tokenizer* self)
+{
+
+}
+
+static PyObject*
+Tokenizer_handle_heading_end(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_really_parse_entity(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_entity(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_comment(Tokenizer* self)
+{
+
+}
+
+
+/*
+    Parse the wikicode string, using context for when to stop.
+*/
+static PyObject*
+Tokenizer_parse(Tokenizer* self, Py_ssize_t context)
+{
+    Py_ssize_t fail_contexts = (
+        CONTEXT("TEMPLATE") | CONTEXT("ARGUMENT") | CONTEXT("HEADING") |
+        CONTEXT("COMMENT"));
+
+    PyObject *this, *next;
+    Py_UNICODE *this_data, *next_data, *next_next_data, *last_data;
+    Py_ssize_t this_context;
+    int is_marker, i;
+
+    Tokenizer_push(self, context);
+
+    while (1) {
+        this = Tokenizer_read(self, 0);
+        /* if (this not in MARKERS) {
+            WRITE TEXT
+        } */
+        this_data = PyUnicode_AS_UNICODE(this);
+
+        is_marker = 0;
+        for (i = 0; i < NUM_MARKERS; i++) {
+            if (MARKERS[i] == this_data) {
+                is_marker = 1;
+                break;
+            }
+        }
+
+        if (!is_marker) {
+            Tokenizer_write_text(self, this);
+            self->head++;
+            continue;
+        }
+
+        this_context = Tokenizer_CONTEXT_VAL(self);
+
+        if (this == EMPTY) {
+            if (this_context & fail_contexts) {
+                Tokenizer_fail_route(self);
+            }
+            return Tokenizer_pop(self);
+        }
+
+        next = Tokenizer_read(self, 1);
+        next_data = PyUnicode_AS_UNICODE(next);
+
+        if (this_context & CONTEXT("COMMENT")) {
+            if (this_data == next_data && next_data == PU "-") {
+                if (PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == PU ">") {
+                    return Tokenizer_pop(self);
+                }
+            }
+            Tokenizer_write_text(self, this);
+        }
+        else if (this_data == next_data && next_data == PU "{") {
+            Tokenizer_parse_template_or_argument(self);
+        }
+        else if (this_data == PU "|" && this_context & CONTEXT("TEMPLATE")) {
+            Tokenizer_handle_template_param(self);
+        }
+        else if (this_data == PU "=" && this_context & CONTEXT("TEMPLATE_PARAM_KEY")) {
+            Tokenizer_handle_template_param_value(self);
+        }
+        else if (this_data == next_data && next_data == PU "}" &&
+                 this_context & CONTEXT("TEMPLATE")) {
+            Tokenizer_handle_template_end(self);
+        }
+        else if (this_data == PU "|" && this_context & CONTEXT("ARGUMENT_NAME")) {
+            Tokenizer_handle_argument_separator(self);
+        }
+        else if (this_data == next_data && next_data == PU "}" &&
+                 this_context & CONTEXT("ARGUMENT")) {
+            if (PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == PU "}") {
+                return Tokenizer_handle_argument_end(self);
+            }
+            Tokenizer_write_text(self, this);
+        }
+        else if (this_data == next_data && next_data == PU "[") {
+            if (!(this_context & CONTEXT("WIKILINK_TITLE"))) {
+                Tokenizer_parse_wikilink(self);
+            }
+            else {
+                Tokenizer_write_text(self, this);
+            }
+        }
+        else if (this_data == PU "|" && this_context & CONTEXT("WIKILINK_TITLE")) {
+            Tokenizer_handle_wikilink_separator(self);
+        }
+        else if (this_data == next_data && next_data == PU "]" &&
+                 this_context & CONTEXT("WIKILINK")) {
+            return Tokenizer_handle_wikilink_end(self);
+        }
+        else if (this_data == PU "=" && !(self->global & CONTEXT("GL_HEADING"))) {
+            last_data = PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, 1));
+            if (last_data == PU "\n" || last_data == PU "") {
+                Tokenizer_parse_heading(self);
+            }
+            else {
+                Tokenizer_write_text(self, this);
+            }
+        }
+        else if (this_data == PU "=" && this_context & CONTEXT("HEADING")) {
+            return Tokenizer_handle_heading_end(self);
+        }
+        else if (this_data == PU "\n" && this_context & CONTEXT("HEADING")) {
+            Tokenizer_fail_route(self);
+        }
+        else if (this_data == PU "&") {
+            Tokenizer_parse_entity(self);
+        }
+        else if (this_data == PU "<" && next_data == PU "!") {
+            next_next_data = PyUnicode_AS_UNICODE(Tokenizer_read(self, 2));
+            if (next_next_data == PyUnicode_AS_UNICODE(Tokenizer_read(self, 3)) &&
+                next_next_data == PU "-") {
+                Tokenizer_parse_comment(self);
+            }
+            else {
+                Tokenizer_write_text(self, this);
+            }
+        }
+        else {
+            Tokenizer_write_text(self, this);
+        }
+
+        self->head++;
+    }
+}

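One caveat worth flagging in this version of the loop: this_data, next_data, and the PU "..." literals are all Py_UNICODE pointers, so checks like MARKERS[i] == this_data or next_data == PU "{" compare addresses rather than characters and will not match the way the branch structure suggests; relatedly, MARKERS holds 18 entries while NUM_MARKERS is 17, so the final empty-string marker is never examined. A hedged sketch of the character-level test the comparisons appear to intend (the helper name is hypothetical, not from this commit):

    /* Hypothetical helper: test the character a reader returned against
       the first character of each marker string, instead of comparing
       buffer addresses. */
    static int
    Tokenizer_is_marker(Py_UNICODE code)
    {
        int i;
        for (i = 0; i < NUM_MARKERS; i++) {
            if (MARKERS[i][0] == code)
                return 1;
        }
        return 0;
    }

    /* Inside the loop, the guards would then read as character tests,
       for example:
           if (*this_data == *next_data && *this_data == '{') ...
       rather than pointer equality between unrelated buffers. */
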
