
Finish porting misc changes; add prototypes for remaining functions.

tags/v0.3
Ben Kurtovic committed 11 years ago · commit 653071379b
2 changed files with 217 additions and 32 deletions:

  1. mwparserfromhell/parser/tokenizer.c (+201, -30)
  2. mwparserfromhell/parser/tokenizer.h (+16, -2)

mwparserfromhell/parser/tokenizer.c (+201, -30)

@@ -487,6 +487,8 @@ Tokenizer_parse_template_or_argument(Tokenizer* self)
         return -1;
     }
     Py_DECREF(tokenlist);
+    if (self->topstack->context & LC_FAIL_NEXT)
+        self->topstack->context ^= LC_FAIL_NEXT;
     return 0;
 }

@@ -499,7 +501,7 @@ Tokenizer_parse_template(Tokenizer* self)
     PyObject *template, *token;
     Py_ssize_t reset = self->head;
 
-    template = Tokenizer_parse(self, LC_TEMPLATE_NAME);
+    template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1);
     if (BAD_ROUTE) {
         self->head = reset;
         return 0;
@@ -542,7 +544,7 @@ Tokenizer_parse_argument(Tokenizer* self)
     PyObject *argument, *token;
     Py_ssize_t reset = self->head;
 
-    argument = Tokenizer_parse(self, LC_ARGUMENT_NAME);
+    argument = Tokenizer_parse(self, LC_ARGUMENT_NAME, 1);
     if (BAD_ROUTE) {
         self->head = reset;
         return 0;
@@ -709,7 +711,7 @@ Tokenizer_parse_wikilink(Tokenizer* self)
 
     self->head += 2;
     reset = self->head - 1;
-    wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE);
+    wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE, 1);
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
@@ -745,6 +747,8 @@ Tokenizer_parse_wikilink(Tokenizer* self)
         return -1;
     }
     Py_DECREF(token);
+    if (self->topstack->context & LC_FAIL_NEXT)
+        self->topstack->context ^= LC_FAIL_NEXT;
     return 0;
 }

@@ -797,7 +801,7 @@ Tokenizer_parse_heading(Tokenizer* self)
         self->head++;
     }
     context = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1);
-    heading = (HeadingData*) Tokenizer_parse(self, context);
+    heading = (HeadingData*) Tokenizer_parse(self, context, 1);
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset + best - 1;
@@ -886,7 +890,7 @@ Tokenizer_handle_heading_end(Tokenizer* self)
     current = heading_level_from_context(self->topstack->context);
     level = current > best ? (best > 6 ? 6 : best) :
                             (current > 6 ? 6 : current);
-    after = (HeadingData*) Tokenizer_parse(self, self->topstack->context);
+    after = (HeadingData*) Tokenizer_parse(self, self->topstack->context, 1);
     if (BAD_ROUTE) {
         RESET_ROUTE();
         if (level < best) {
@@ -1144,7 +1148,7 @@ Tokenizer_parse_comment(Tokenizer* self)
     int i;
 
     self->head += 4;
-    comment = Tokenizer_parse(self, LC_COMMENT);
+    comment = Tokenizer_parse(self, LC_COMMENT, 1);
     if (BAD_ROUTE) {
         const char* text = "<!--";
         RESET_ROUTE();
@@ -1192,6 +1196,156 @@ Tokenizer_parse_comment(Tokenizer* self)
 }
 
+/*
+    Parse an HTML tag at the head of the wikicode string.
+*/
+static int
+Tokenizer_parse_tag(Tokenizer* self)
+{
+    return 0;
+}
+
+/*
+    Actually parse an HTML tag, starting with the open (<foo>).
+*/
+static PyObject*
+Tokenizer_really_parse_tag(Tokenizer* self)
+{
+    return NULL;
+}
+
+/*
+    Write a pending tag attribute from data to the stack.
+*/
+static int
+Tokenizer_push_tag_buffer(Tokenizer* self, TagOpenData* data)
+{
+    return 0;
+}
+
+/*
+    Handle all sorts of text data inside of an HTML open tag.
+*/
+static int
+Tokenizer_handle_tag_data(Tokenizer* self, TagOpenData* data, Py_UNICODE text)
+{
+    return 0;
+}
+
+/*
+    Handle whitespace inside of an HTML open tag.
+*/
+static int
+Tokenizer_handle_tag_space(Tokenizer* self, TagOpenData* data, Py_UNICODE text)
+{
+    return 0;
+}
+
+/*
+    Handle regular text inside of an HTML open tag.
+*/
+static int
+Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text)
+{
+    return 0;
+}
+
+/*
+    Handle the body of an HTML tag that is parser-blacklisted.
+*/
+static PyObject*
+Tokenizer_handle_blacklisted_tag(Tokenizer* self)
+{
+    return NULL;
+}
+
+/*
+    Handle the closing of an open tag (<foo>).
+*/
+static int
+Tokenizer_handle_tag_close_open(Tokenizer* self, TagOpenData* data,
+                                PyObject* token)
+{
+    return 0;
+}
+
+/*
+    Handle the opening of a closing tag (</foo>).
+*/
+static int
+Tokenizer_handle_tag_open_close(Tokenizer* self)
+{
+    return 0;
+}
+
+/*
+    Handle the ending of a closing tag (</foo>).
+*/
+static PyObject*
+Tokenizer_handle_tag_close_close(Tokenizer* self)
+{
+    return NULL;
+}
+
+/*
+    Handle the (possible) start of an implicitly closing single tag.
+*/
+static int
+Tokenizer_handle_invalid_tag_start(Tokenizer* self)
+{
+    return 0;
+}
+
+/*
+    Handle the end of an implicitly closing single-only HTML tag.
+*/
+static PyObject*
+Tokenizer_handle_single_only_tag_end(Tokenizer* self)
+{
+    return NULL;
+}
+
+/*
+    Handle the stream end when inside a single-supporting HTML tag.
+*/
+static PyObject*
+Tokenizer_handle_single_tag_end(Tokenizer* self)
+{
+    return NULL;
+}
+
+/*
+    Handle the end of the stream of wikitext.
+*/
+static PyObject*
+Tokenizer_handle_end(Tokenizer* self, int context)
+{
+    static int fail_contexts = (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK |
+                                LC_HEADING | LC_COMMENT);
+    static int double_fail = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
+    PyObject *token, *text, *trash;
+    int single;
+
+    if (context & fail_contexts) {
+        if (context & LC_TAG_BODY) {
+            token = PyList_GET_ITEM(self->topstack->stack, 1);
+            text = PyObject_GetAttrString(token, "text");
+            if (!text)
+                return NULL;
+            single = IS_SINGLE(text);
+            Py_DECREF(text);
+            if (single)
+                return Tokenizer_handle_single_tag_end(self);
+        }
+        else if (context & double_fail) {
+            trash = Tokenizer_pop(self);
+            Py_XDECREF(trash);
+        }
+        return Tokenizer_fail_route(self);
+    }
+    return Tokenizer_pop(self);
+}
+
 /*
     Make sure we are not trying to write an invalid character. Return 0 if
     everything is safe, or -1 if the route must be failed.
 */
@@ -1208,6 +1362,11 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
             return -1;
         return 0;
     }
+    if (context & LC_TAG_CLOSE) {
+        if (data == *"<")
+            return -1;
+        return 0;
+    }
     if (context & LC_TEMPLATE_NAME) {
         if (data == *"{" || data == *"}" || data == *"[") {
             self->topstack->context |= LC_FAIL_NEXT;
@@ -1267,32 +1426,33 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
 }
 
 /*
-    Parse the wikicode string, using context for when to stop.
+    Parse the wikicode string, using context for when to stop. If push is true,
+    we will push a new context, otherwise we won't and context will be ignored.
 */
 static PyObject*
-Tokenizer_parse(Tokenizer* self, int context)
+Tokenizer_parse(Tokenizer* self, int context, int push)
 {
-    static int fail_contexts = (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK |
-                                LC_HEADING | LC_COMMENT);
     static int unsafe_contexts = (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE |
                                   LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME);
+    static int double_unsafe = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
     int this_context, is_marker, i;
     Py_UNICODE this, next, next_next, last;
-    PyObject *trash;
+    PyObject* trash;
 
-    if (Tokenizer_push(self, context))
-        return NULL;
+    if (push) {
+        if (Tokenizer_push(self, context))
+            return NULL;
+    }
     while (1) {
         this = Tokenizer_READ(self, 0);
         this_context = self->topstack->context;
         if (this_context & unsafe_contexts) {
             if (Tokenizer_verify_safe(self, this_context, this) < 0) {
-                if (this_context & LC_TEMPLATE_PARAM_KEY) {
+                if (this_context & double_unsafe) {
                     trash = Tokenizer_pop(self);
                     Py_XDECREF(trash);
                 }
-                Tokenizer_fail_route(self);
-                return NULL;
+                return Tokenizer_fail_route(self);
             }
         }
         is_marker = 0;
@@ -1307,15 +1467,8 @@ Tokenizer_parse(Tokenizer* self, int context)
             self->head++;
             continue;
         }
-        if (this == *"") {
-            if (this_context & LC_TEMPLATE_PARAM_KEY) {
-                trash = Tokenizer_pop(self);
-                Py_XDECREF(trash);
-            }
-            if (this_context & fail_contexts)
-                return Tokenizer_fail_route(self);
-            return Tokenizer_pop(self);
-        }
+        if (this == *"")
+            return Tokenizer_handle_end(self, this_context);
         next = Tokenizer_READ(self, 1);
         if (this_context & LC_COMMENT) {
             if (this == next && next == *"-") {
@@ -1328,8 +1481,6 @@ Tokenizer_parse(Tokenizer* self, int context)
             if (Tokenizer_CAN_RECURSE(self)) {
                 if (Tokenizer_parse_template_or_argument(self))
                     return NULL;
-                if (self->topstack->context & LC_FAIL_NEXT)
-                    self->topstack->context ^= LC_FAIL_NEXT;
             }
             else
                 Tokenizer_emit_text(self, this);
@@ -1359,8 +1510,6 @@ Tokenizer_parse(Tokenizer* self, int context)
                     Tokenizer_CAN_RECURSE(self)) {
                 if (Tokenizer_parse_wikilink(self))
                     return NULL;
-                if (self->topstack->context & LC_FAIL_NEXT)
-                    self->topstack->context ^= LC_FAIL_NEXT;
             }
             else
                 Tokenizer_emit_text(self, this);
@@ -1397,6 +1546,28 @@ Tokenizer_parse(Tokenizer* self, int context)
             else
                 Tokenizer_emit_text(self, this);
         }
+        else if (this == *"<" && next == *"/" &&
+                 Tokenizer_READ(self, 2) != *"") {
+            if (this_context & LC_TAG_BODY) {
+                if (Tokenizer_handle_tag_open_close(self))
+                    return NULL;
+            }
+            else {
+                if (Tokenizer_handle_invalid_tag_start(self))
+                    return NULL;
+            }
+        }
+        else if (this == *"<") {
+            if (!(this_context & LC_TAG_CLOSE) &&
+                    Tokenizer_CAN_RECURSE(self)) {
+                if (Tokenizer_parse_tag(self))
+                    return NULL;
+            }
+            else
+                Tokenizer_emit_text(self, this);
+        }
+        else if (this == *">" && this_context & LC_TAG_CLOSE)
+            return Tokenizer_handle_tag_close_close(self);
         else
             Tokenizer_emit_text(self, this);
         self->head++;
@@ -1431,7 +1602,7 @@ Tokenizer_tokenize(Tokenizer* self, PyObject* args)
         self->text = PySequence_Fast(text, "expected a sequence");
     }
     self->length = PyList_GET_SIZE(self->text);
-    return Tokenizer_parse(self, 0);
+    return Tokenizer_parse(self, 0, 1);
 }
 
 static void
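
Every call site updated in this diff passes 1 for the new push argument, so existing behavior is preserved; per the updated doc comment, a push of 0 skips Tokenizer_push and ignores the context argument entirely, presumably for the tag-handling routines stubbed out above. A toy model of that contract (hypothetical names, not code from this commit):

#include <stdio.h>

/* Toy model of the push flag: parse() optionally pushes a fresh context
   before scanning, mirroring Tokenizer_parse(self, context, push). */
static int stack[16];
static int depth = 0;

static void parse(int context, int push)
{
    if (push)
        stack[depth++] = context;  /* push == 1: new context, old behavior */
    /* ... scan text using stack[depth - 1] as the active context ... */
    printf("parsing with context %d at depth %d\n", stack[depth - 1], depth);
    if (push)
        depth--;                   /* pop what we pushed */
}

int main(void)
{
    stack[depth++] = 0;  /* root context */
    parse(42, 1);        /* recursive parse: pushes context 42 */
    parse(99, 0);        /* push == 0: context argument is ignored */
    return 0;
}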


mwparserfromhell/parser/tokenizer.h (+16, -2)

@@ -192,7 +192,7 @@ typedef struct Stack Stack;
 typedef struct {
     PyObject_HEAD
     PyObject* text;         /* text to tokenize */
-    struct Stack* topstack; /* topmost stack */
+    Stack* topstack;        /* topmost stack */
     Py_ssize_t head;        /* current position in text */
     Py_ssize_t length;      /* length of text */
     int global;             /* global context */
@@ -251,8 +251,22 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer*);
 static int Tokenizer_really_parse_entity(Tokenizer*);
 static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
+static int Tokenizer_parse_tag(Tokenizer*);
+static PyObject* Tokenizer_really_parse_tag(Tokenizer*);
+static int Tokenizer_push_tag_buffer(Tokenizer*, TagOpenData*);
+static int Tokenizer_handle_tag_data(Tokenizer*, TagOpenData*, Py_UNICODE);
+static int Tokenizer_handle_tag_space(Tokenizer*, TagOpenData*, Py_UNICODE);
+static int Tokenizer_handle_tag_text(Tokenizer*, Py_UNICODE);
+static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer*);
+static int Tokenizer_handle_tag_close_open(Tokenizer*, TagOpenData*, PyObject*);
+static int Tokenizer_handle_tag_open_close(Tokenizer*);
+static PyObject* Tokenizer_handle_tag_close_close(Tokenizer*);
+static int Tokenizer_handle_invalid_tag_start(Tokenizer*);
+static PyObject* Tokenizer_handle_single_only_tag_end(Tokenizer*);
+static PyObject* Tokenizer_handle_single_tag_end(Tokenizer*);
+static PyObject* Tokenizer_handle_end(Tokenizer*, int);
 static int Tokenizer_verify_safe(Tokenizer*, int, Py_UNICODE);
-static PyObject* Tokenizer_parse(Tokenizer*, int);
+static PyObject* Tokenizer_parse(Tokenizer*, int, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);



