Browse Source

Tokenizer_parse_bracketed/free_uri_scheme(), other adjustments

tags/v0.3
Ben Kurtovic 10 years ago
parent
commit
a1948b06aa
2 changed files with 304 additions and 23 deletions
  1. +298
    -17
      mwparserfromhell/parser/tokenizer.c
  2. +6
    -6
      mwparserfromhell/parser/tokenizer.h

+ 298
- 17
mwparserfromhell/parser/tokenizer.c View File

@@ -51,13 +51,13 @@ static int heading_level_from_context(int n)
}

/*
Call the given function in definitions.py, using 'input' as a parameter,
and return its output as a bool.
Call the given function in definitions.py, using 'in1' and 'in2' as
parameters, and return its output as a bool.
*/
static int call_def_func(const char* funcname, PyObject* input)
static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2)
{
PyObject* func = PyObject_GetAttrString(definitions, funcname);
PyObject* result = PyObject_CallFunctionObjArgs(func, input, NULL);
PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL);
int ans = (result == Py_True) ? 1 : 0;

Py_DECREF(func);
@@ -129,8 +129,7 @@ static int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
new->next = self;
*this = self = new;
}
self->data[self->size] = code;
self->size++;
self->data[self->size++] = code;
return 0;
}

@@ -822,6 +821,288 @@ static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self)
}

/*
    Parse the URI scheme of a bracket-enclosed external link.

    Pushes a new stack with the LC_EXT_LINK_URI context, then consumes either
    a leading "//" or a run of scheme characters followed by ":" (and an
    optional "//"). Everything consumed is emitted as text and self->head is
    advanced past it.

    Returns 0 on success and also after Tokenizer_fail_route() has been called
    (no ":" after the scheme characters, or is_scheme rejected the scheme).
    Returns -1 if a push, emit, buffer write or render call reports failure.
*/
static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
{
    static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-";
    Textbuffer* buffer;
    PyObject* scheme = NULL;
    Py_UNICODE this;
    int slashes, i, retval = -1;

    if (Tokenizer_push(self, LC_EXT_LINK_URI))
        return -1;
    if (Tokenizer_READ(self, 0) == '/' && Tokenizer_READ(self, 1) == '/') {
        // The link starts with "//": there is no scheme to validate.
        if (Tokenizer_emit_text(self, "//"))
            return -1;
        self->head += 2;
        return 0;
    }

    buffer = Textbuffer_new();
    if (!buffer)
        return -1;
    while ((this = Tokenizer_READ(self, 0)) != '\0') {
        // Stop at the first character that is not allowed in a scheme.
        for (i = 0; valid[i] && this != valid[i]; i++)
            ;
        if (!valid[i])
            break;
        // A failed write would silently truncate the scheme, so treat a
        // nonzero result as an error like the other int-returning helpers.
        if (Textbuffer_write(&buffer, this))
            goto cleanup;
        if (Tokenizer_emit_char(self, this))
            goto cleanup;
        self->head++;
    }
    if (this != ':') {
        // Scheme characters must be terminated by a colon.
        Tokenizer_fail_route(self);
        retval = 0;
        goto cleanup;
    }
    if (Tokenizer_emit_char(self, ':'))
        goto cleanup;
    self->head++;
    slashes = (Tokenizer_READ(self, 0) == '/' &&
               Tokenizer_READ(self, 1) == '/');
    if (slashes) {
        if (Tokenizer_emit_text(self, "//"))
            goto cleanup;
        self->head += 2;
    }
    scheme = Textbuffer_render(buffer);
    if (!scheme)
        goto cleanup;
    if (!IS_SCHEME(scheme, slashes ? Py_True : Py_False))
        Tokenizer_fail_route(self);
    retval = 0;

cleanup:
    // Single exit path: release the rendered scheme (if any) and the buffer.
    Py_XDECREF(scheme);
    Textbuffer_dealloc(buffer);
    return retval;
}

/*
    Parse the URI scheme of a free (no brackets) external link.

    The scheme has already been written to the current stack's textbuffer as
    plain text, so it is recovered by scanning that textbuffer backwards until
    whitespace or a marker is found. If the recovered scheme is accepted by
    is_scheme, a new LC_EXT_LINK_URI stack is pushed and the scheme, ":" and
    an optional "//" are emitted onto it.

    Returns 0 on success and also after FAIL_ROUTE(0) (an invalid scheme
    character was found, or is_scheme rejected the scheme). Returns -1 if a
    buffer, render, push or emit call reports failure.
*/
static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
{
    static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-";
    Textbuffer *scheme_buffer = Textbuffer_new(), *forward = NULL, *node;
    PyObject* scheme = NULL;
    Py_UNICODE chunk;
    int slashes, i, j, retval = -1;

    if (!scheme_buffer)
        return -1;
    // We have to backtrack through the textbuffer looking for our scheme since
    // it was just parsed as text. Textbuffer_write() prepends a new chunk when
    // the current one fills up, so following ->next moves towards older text.
    for (node = self->topstack->textbuffer; node; node = node->next) {
        for (i = node->size - 1; i >= 0; i--) {
            chunk = node->data[i];
            if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
                goto end_of_scheme;
            for (j = 0; valid[j] && chunk != valid[j]; j++)
                ;
            if (!valid[j]) {
                FAIL_ROUTE(0);
                retval = 0;
                goto cleanup;
            }
            if (Textbuffer_write(&scheme_buffer, chunk))
                goto cleanup;
        }
    }

end_of_scheme:
    // scheme_buffer now holds the scheme back-to-front. Walking it newest
    // chunk first, last character first, yields the original order; copy it
    // into a second buffer so it can be rendered as a normal string.
    // (Assumes Textbuffer_render() returns characters in the order written,
    // as the bracketed-scheme parser relies on - confirm.)
    forward = Textbuffer_new();
    if (!forward)
        goto cleanup;
    for (node = scheme_buffer; node; node = node->next) {
        for (i = node->size - 1; i >= 0; i--) {
            if (Textbuffer_write(&forward, node->data[i]))
                goto cleanup;
        }
    }
    scheme = Textbuffer_render(forward);
    if (!scheme)
        goto cleanup;
    slashes = (Tokenizer_READ(self, 0) == '/' &&
               Tokenizer_READ(self, 1) == '/');
    if (!IS_SCHEME(scheme, slashes ? Py_True : Py_False)) {
        FAIL_ROUTE(0);
        retval = 0;
        goto cleanup;
    }
    if (Tokenizer_push(self, LC_EXT_LINK_URI))
        goto cleanup;
    // Emit the scheme itself (in forward order) onto the new stack.
    for (node = scheme_buffer; node; node = node->next) {
        for (i = node->size - 1; i >= 0; i--) {
            if (Tokenizer_emit_char(self, node->data[i]))
                goto cleanup;
        }
    }
    if (Tokenizer_emit_char(self, ':'))
        goto cleanup;
    if (slashes) {
        if (Tokenizer_emit_text(self, "//"))
            goto cleanup;
        self->head += 2;
    }
    retval = 0;

cleanup:
    // Single exit path: release everything that was acquired above.
    Py_XDECREF(scheme);
    if (forward)
        Textbuffer_dealloc(forward);
    Textbuffer_dealloc(scheme_buffer);
    return retval;
}

/*
Handle text in a free external link, including trailing punctuation.

NOTE(review): this function is not implemented yet. Its body contains only
commented-out Python-style reference logic, so this non-void function has no
return statement. The logic below still has to be ported to C.
*/
static int
Tokenizer_handle_free_link_text(Tokenizer* self, PyObject** punct,
Textbuffer** tail, Py_UNICODE this)
{
// Reference logic to port (Python-style pseudocode):
// if "(" in this and ")" in punct:
// punct = punct[:-1] # ')' is no longer valid punctuation
// if this.endswith(punct):
// for i in reversed(range(-len(this), 0)):
// if i == -len(this) or this[i - 1] not in punct:
// break
// stripped = this[:i]
// if stripped and tail:
// self._emit_text(tail)
// tail = ""
// tail += this[i:]
// this = stripped
// elif tail:
// self._emit_text(tail)
// tail = ""
// self._emit_text(this)
// return punct, tail
}

/*
Really parse an external link.

'brackets' is nonzero for a bracket-enclosed link and zero for a free link
(see the "if brackets:" branch in the reference logic below).

NOTE(review): this function is not implemented yet. Its body contains only
commented-out Python-style reference logic, so this function returning
PyObject* has no return statement. The reference logic returns three values
(stack, tail, delta); how those map onto a single PyObject* is not decided
here.
*/
static PyObject*
Tokenizer_really_parse_external_link(Tokenizer* self, int brackets)
{
// Reference logic to port (Python-style pseudocode):
// if brackets:
// self._parse_bracketed_uri_scheme()
// invalid = ("\n", " ", "]")
// else:
// self._parse_free_uri_scheme()
// invalid = ("\n", " ", "[", "]")
// punct = tuple(",;\.:!?)")
// if self._read() is self.END or self._read()[0] in invalid:
// self._fail_route()
// tail = ""
// while True:
// this, next = self._read(), self._read(1)
// if this is self.END or this == "\n":
// if brackets:
// self._fail_route()
// return self._pop(), tail, -1
// elif this == next == "{" and self._can_recurse():
// if tail:
// self._emit_text(tail)
// tail = ""
// self._parse_template_or_argument()
// elif this == "[":
// if brackets:
// self._emit_text("[")
// else:
// return self._pop(), tail, -1
// elif this == "]":
// return self._pop(), tail, 0 if brackets else -1
// elif this == "&":
// if tail:
// self._emit_text(tail)
// tail = ""
// self._parse_entity()
// elif " " in this:
// before, after = this.split(" ", 1)
// if brackets:
// self._emit_text(before)
// self._emit(tokens.ExternalLinkSeparator())
// self._emit_text(after)
// self._context ^= contexts.EXT_LINK_URI
// self._context |= contexts.EXT_LINK_TITLE
// self._head += 1
// return self._parse(push=False), None, 0
// punct, tail = self._handle_free_link_text(punct, tail, before)
// return self._pop(), tail + " " + after, 0
// elif not brackets:
// punct, tail = self._handle_free_link_text(punct, tail, this)
// else:
// self._emit_text(this)
// self._head += 1
}

/*
Remove the URI scheme of a new external link from the textbuffer.

NOTE(review): this function is not implemented yet. Its body contains only
commented-out Python-style reference logic, so this non-void function has no
return statement. The reference logic trims len(scheme) characters from the
end of the textbuffer, dropping whole chunks while the remaining length is at
least the size of the last chunk.
*/
static int
Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* scheme)
{
// Reference logic to port (Python-style pseudocode):
// length = len(scheme)
// while length:
// if length < len(self._textbuffer[-1]):
// self._textbuffer[-1] = self._textbuffer[-1][:-length]
// break
// length -= len(self._textbuffer[-1])
// self._textbuffer.pop()
}

/*
Parse an external link at the head of the wikicode string.

'brackets' is passed as 1 by Tokenizer_parse() when the current character is
"[" and as 0 when it is ":".

NOTE(review): this function is not implemented yet. Its body contains only
commented-out Python-style reference logic, so this non-void function has no
return statement, while Tokenizer_parse() tests its result
("if (Tokenizer_parse_external_link(self, 1)) return NULL;"). The value seen
by that caller is therefore undefined until the logic below is ported.
*/
static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
{
// Reference logic to port (Python-style pseudocode):
// reset = self._head
// self._head += 1
// try:
// bad_context = self._context & contexts.INVALID_LINK
// if bad_context or not self._can_recurse():
// raise BadRoute()
// link, extra, delta = self._really_parse_external_link(brackets)
// except BadRoute:
// self._head = reset
// if not brackets and self._context & contexts.DL_TERM:
// self._handle_dl_term()
// else:
// self._emit_text(self._read())
// else:
// if not brackets:
// scheme = link[0].text.split(":", 1)[0]
// self._remove_uri_scheme_from_textbuffer(scheme)
// self._emit(tokens.ExternalLinkOpen(brackets=brackets))
// self._emit_all(link)
// self._emit(tokens.ExternalLinkClose())
// self._head += delta
// if extra:
// self._emit_text(extra)
}

/*
Parse a section heading at the head of the wikicode string.
*/
static int Tokenizer_parse_heading(Tokenizer* self)
@@ -1978,9 +2259,9 @@ static int Tokenizer_handle_hr(Tokenizer* self)
self->head++;
}
markup = Textbuffer_render(buffer);
Textbuffer_dealloc(buffer);
if (!markup)
return -1;
Textbuffer_dealloc(buffer);
kwargs = PyDict_New();
if (!kwargs)
return -1;
@@ -2192,16 +2473,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
}
else if (this == next && next == *"]" && this_context & LC_WIKILINK)
return Tokenizer_handle_wikilink_end(self);
// else if (this == *"[") {
// if (Tokenizer_parse_external_link(self, 1))
// return NULL;
// }
// else if (this == *":" && !is_marker(last)) {
// if (Tokenizer_parse_external_link(self, 0))
// return NULL;
// }
// else if (this == *"]" && this_context & LC_EXT_LINK_TITLE)
// return Tokenizer_pop(self);
else if (this == *"[") {
if (Tokenizer_parse_external_link(self, 1))
return NULL;
}
else if (this == *":" && !is_marker(last)) {
if (Tokenizer_parse_external_link(self, 0))
return NULL;
}
else if (this == *"]" && this_context & LC_EXT_LINK_TITLE)
return Tokenizer_pop(self);
else if (this == *"=" && !(self->global & GL_HEADING)) {
if (last == *"\n" || last == *"") {
if (Tokenizer_parse_heading(self))


+ 6
- 6
mwparserfromhell/parser/tokenizer.h View File

@@ -122,7 +122,7 @@ static PyObject* TagCloseClose;
#define LC_WIKILINK_TEXT 0x00000040

#define LC_EXT_LINK 0x00000380
#define LC_EXT_LINK_URL 0x00000080
#define LC_EXT_LINK_URI 0x00000080
#define LC_EXT_LINK_TITLE 0x00000100
#define LC_EXT_LINK_BRACKETS 0x00000200

@@ -165,7 +165,7 @@ static PyObject* TagCloseClose;
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK_URL)
#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK_URI)

/* Tag contexts: */

@@ -241,10 +241,10 @@ typedef struct {
/* Macros for accessing definitions: */

#define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li")
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag))
#define IS_SINGLE(tag) (call_def_func("is_single", tag))
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag))
#define IS_SCHEME(scheme) (call_def_func("is_scheme", scheme))
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL))
#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL))
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL))
#define IS_SCHEME(scheme, slashes) (call_def_func("is_scheme", scheme, slashes))


/* Function prototypes: */


Loading…
Cancel
Save