diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h
index 2ed5a02..58c9487 100644
--- a/mwparserfromhell/parser/ctokenizer/common.h
+++ b/mwparserfromhell/parser/ctokenizer/common.h
@@ -20,14 +20,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
 
+#pragma once
+
 #ifndef PY_SSIZE_T_CLEAN
-#define PY_SSIZE_T_CLEAN
+#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html
 #endif
 
 #include <Python.h>
 #include <math.h>
 #include <structmember.h>
 
+/* Compatibility macros */
+
 #if PY_MAJOR_VERSION >= 3
 #define IS_PY3K
 #endif
@@ -36,5 +40,53 @@ SOFTWARE.
 #define uint64_t unsigned PY_LONG_LONG
 #endif
 
-#define malloc PyObject_Malloc
+#define malloc PyObject_Malloc // XXX: yuck
 #define free PyObject_Free
+
+/* Error handling globals/macros */
+
+extern int route_state; // TODO: this is NOT thread-safe!
+extern uint64_t route_context;
+
+#define BAD_ROUTE route_state
+#define BAD_ROUTE_CONTEXT route_context
+#define FAIL_ROUTE(context) { route_state = 1; route_context = context; }
+#define RESET_ROUTE() route_state = 0
+
+/* Shared globals */
+
+extern char** entitydefs;
+
+extern PyObject* EMPTY;
+extern PyObject* NOARGS;
+extern PyObject* definitions;
+
+/* Structs */
+
+struct Textbuffer {
+    Py_ssize_t size;
+    Py_UNICODE* data;
+    struct Textbuffer* prev;
+    struct Textbuffer* next;
+};
+typedef struct Textbuffer Textbuffer;
+
+struct Stack {
+    PyObject* stack;
+    uint64_t context;
+    struct Textbuffer* textbuffer;
+    struct Stack* next;
+};
+typedef struct Stack Stack;
+
+typedef struct {
+    PyObject_HEAD
+    PyObject* text;         /* text to tokenize */
+    Stack* topstack;        /* topmost stack */
+    Py_ssize_t head;        /* current position in text */
+    Py_ssize_t length;      /* length of text */
+    int global;             /* global context */
+    int depth;              /* stack recursion depth */
+    int cycles;             /* total number of stack recursions */
+    int skip_style_tags;    /* temporary fix for the sometimes broken tag parser */
+} Tokenizer;
diff --git a/mwparserfromhell/parser/ctokenizer/contexts.h b/mwparserfromhell/parser/ctokenizer/contexts.h
new file mode 100644
index 0000000..8e24372
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/contexts.h
@@ -0,0 +1,104 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/ + +#pragma once + +/* Local contexts */ + +#define LC_TEMPLATE 0x0000000000000007 +#define LC_TEMPLATE_NAME 0x0000000000000001 +#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002 +#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004 + +#define LC_ARGUMENT 0x0000000000000018 +#define LC_ARGUMENT_NAME 0x0000000000000008 +#define LC_ARGUMENT_DEFAULT 0x0000000000000010 + +#define LC_WIKILINK 0x0000000000000060 +#define LC_WIKILINK_TITLE 0x0000000000000020 +#define LC_WIKILINK_TEXT 0x0000000000000040 + +#define LC_EXT_LINK 0x0000000000000180 +#define LC_EXT_LINK_URI 0x0000000000000080 +#define LC_EXT_LINK_TITLE 0x0000000000000100 + +#define LC_HEADING 0x0000000000007E00 +#define LC_HEADING_LEVEL_1 0x0000000000000200 +#define LC_HEADING_LEVEL_2 0x0000000000000400 +#define LC_HEADING_LEVEL_3 0x0000000000000800 +#define LC_HEADING_LEVEL_4 0x0000000000001000 +#define LC_HEADING_LEVEL_5 0x0000000000002000 +#define LC_HEADING_LEVEL_6 0x0000000000004000 + +#define LC_TAG 0x0000000000078000 +#define LC_TAG_OPEN 0x0000000000008000 +#define LC_TAG_ATTR 0x0000000000010000 +#define LC_TAG_BODY 0x0000000000020000 +#define LC_TAG_CLOSE 0x0000000000040000 + +#define LC_STYLE 0x0000000000780000 +#define LC_STYLE_ITALICS 0x0000000000080000 +#define LC_STYLE_BOLD 0x0000000000100000 +#define LC_STYLE_PASS_AGAIN 0x0000000000200000 +#define LC_STYLE_SECOND_PASS 0x0000000000400000 + +#define LC_DLTERM 0x0000000000800000 + +#define LC_SAFETY_CHECK 0x000000003F000000 +#define LC_HAS_TEXT 0x0000000001000000 +#define LC_FAIL_ON_TEXT 0x0000000002000000 +#define LC_FAIL_NEXT 0x0000000004000000 +#define LC_FAIL_ON_LBRACE 0x0000000008000000 +#define LC_FAIL_ON_RBRACE 0x0000000010000000 +#define LC_FAIL_ON_EQUALS 0x0000000020000000 + +#define LC_TABLE 0x0000000FC0000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 +#define LC_TABLE_OPEN 0x0000000040000000 +#define LC_TABLE_CELL_OPEN 0x0000000080000000 +#define LC_TABLE_CELL_STYLE 0x0000000100000000 +#define LC_TABLE_ROW_OPEN 0x0000000200000000 +#define LC_TABLE_TD_LINE 0x0000000400000000 +#define LC_TABLE_TH_LINE 0x0000000800000000 + +/* Global contexts */ + +#define GL_HEADING 0x1 + +/* Aggregate contexts */ + +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN) +#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) +#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) + +/* Tag contexts */ + +#define TAG_NAME 0x01 +#define TAG_ATTR_READY 0x02 +#define TAG_ATTR_NAME 0x04 +#define TAG_ATTR_VALUE 0x08 +#define TAG_QUOTED 0x10 +#define TAG_NOTE_SPACE 0x20 +#define TAG_NOTE_EQUALS 0x40 +#define TAG_NOTE_QUOTE 0x80 diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.c b/mwparserfromhell/parser/ctokenizer/tag_data.c new file mode 100644 index 0000000..968a760 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tag_data.c @@ -0,0 +1,88 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the 
Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "tag_data.h" +#include "contexts.h" + +/* + Initialize a new TagData object. +*/ +TagData* TagData_new(void) +{ +#define ALLOC_BUFFER(name) \ + name = Textbuffer_new(); \ + if (!name) { \ + TagData_dealloc(self); \ + return NULL; \ + } + + TagData *self = malloc(sizeof(TagData)); + if (!self) { + PyErr_NoMemory(); + return NULL; + } + self->context = TAG_NAME; + ALLOC_BUFFER(self->pad_first) + ALLOC_BUFFER(self->pad_before_eq) + ALLOC_BUFFER(self->pad_after_eq) + self->quoter = 0; + self->reset = 0; + return self; + +#undef ALLOC_BUFFER +} + +/* + Deallocate the given TagData object. +*/ +void TagData_dealloc(TagData* self) +{ +#define DEALLOC_BUFFER(name) \ + if (name) \ + Textbuffer_dealloc(name); + + DEALLOC_BUFFER(self->pad_first); + DEALLOC_BUFFER(self->pad_before_eq); + DEALLOC_BUFFER(self->pad_after_eq); + free(self); + +#undef DEALLOC_BUFFER +} + +/* + Clear the internal buffers of the given TagData object. +*/ +int TagData_reset_buffers(TagData* self) +{ +#define RESET_BUFFER(name) \ + Textbuffer_dealloc(name); \ + name = Textbuffer_new(); \ + if (!name) \ + return -1; + + RESET_BUFFER(self->pad_first) + RESET_BUFFER(self->pad_before_eq) + RESET_BUFFER(self->pad_after_eq) + return 0; + +#undef RESET_BUFFER +} diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.h b/mwparserfromhell/parser/ctokenizer/tag_data.h new file mode 100644 index 0000000..e2ae807 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tag_data.h @@ -0,0 +1,43 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#pragma once + +#include "common.h" +#include "textbuffer.h" + +/* Structs */ + +typedef struct { + uint64_t context; + Textbuffer* pad_first; + Textbuffer* pad_before_eq; + Textbuffer* pad_after_eq; + Py_UNICODE quoter; + Py_ssize_t reset; +} TagData; + +/* Functions */ + +TagData* TagData_new(void); +void TagData_dealloc(TagData*); +int TagData_reset_buffers(TagData*); diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.h b/mwparserfromhell/parser/ctokenizer/textbuffer.h index 36b2207..389a9fe 100644 --- a/mwparserfromhell/parser/ctokenizer/textbuffer.h +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h @@ -20,17 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "common.h" - -/* Structs */ +#pragma once -struct Textbuffer { - Py_ssize_t size; - Py_UNICODE* data; - struct Textbuffer* prev; - struct Textbuffer* next; -}; -typedef struct Textbuffer Textbuffer; +#include "common.h" /* Functions */ diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c new file mode 100644 index 0000000..1e6424d --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -0,0 +1,2750 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "tok_parse.h" +#include "contexts.h" +#include "tag_data.h" +#include "tok_support.h" +#include "tokens.h" + +#define DIGITS "0123456789" +#define HEXDIGITS "0123456789abcdefABCDEF" +#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + +static const char MARKERS[] = { + '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', + '-', '!', '\n', '\0'}; + +#define NUM_MARKERS 19 +#define MAX_BRACES 255 +#define MAX_ENTITY_SIZE 8 + +#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li") +#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL)) +#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL)) +#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL)) +#define IS_SCHEME(scheme, slashes, reverse) \ + (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? 
Py_True : Py_False)) + +#ifdef IS_PY3K + #define NEW_INT_FUNC PyLong_FromSsize_t +#else + #define NEW_INT_FUNC PyInt_FromSsize_t +#endif + +typedef struct { + PyObject* title; + int level; +} HeadingData; + +/* Forward declarations */ + +static int Tokenizer_parse_entity(Tokenizer*); +static int Tokenizer_parse_comment(Tokenizer*); +static int Tokenizer_handle_dl_term(Tokenizer*); +static int Tokenizer_parse_tag(Tokenizer*); + +/* + Determine whether the given Py_UNICODE is a marker. +*/ +static int is_marker(Py_UNICODE this) +{ + int i; + + for (i = 0; i < NUM_MARKERS; i++) { + if (MARKERS[i] == this) + return 1; + } + return 0; +} + +/* + Given a context, return the heading level encoded within it. +*/ +static int heading_level_from_context(uint64_t n) +{ + int level; + + n /= LC_HEADING_LEVEL_1; + for (level = 1; n > 1; n >>= 1) + level++; + return level; +} + +/* + Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as + parameters, and return its output as a bool. +*/ +static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2, + PyObject* in3) +{ + PyObject* func = PyObject_GetAttrString(definitions, funcname); + PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL); + int ans = (result == Py_True) ? 1 : 0; + + Py_DECREF(func); + Py_DECREF(result); + return ans; +} + +/* + Sanitize the name of a tag so it can be compared with others for equality. +*/ +static PyObject* strip_tag_name(PyObject* token, int take_attr) +{ + PyObject *text, *rstripped, *lowered; + + if (take_attr) { + text = PyObject_GetAttrString(token, "text"); + if (!text) + return NULL; + rstripped = PyObject_CallMethod(text, "rstrip", NULL); + Py_DECREF(text); + } + else + rstripped = PyObject_CallMethod(token, "rstrip", NULL); + if (!rstripped) + return NULL; + lowered = PyObject_CallMethod(rstripped, "lower", NULL); + Py_DECREF(rstripped); + return lowered; +} + +/* + Parse a template at the head of the wikicode string. +*/ +static int Tokenizer_parse_template(Tokenizer* self) +{ + PyObject *template; + Py_ssize_t reset = self->head; + + template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!template) + return -1; + if (Tokenizer_emit_first(self, TemplateOpen)) { + Py_DECREF(template); + return -1; + } + if (Tokenizer_emit_all(self, template)) { + Py_DECREF(template); + return -1; + } + Py_DECREF(template); + if (Tokenizer_emit(self, TemplateClose)) + return -1; + return 0; +} + +/* + Parse an argument at the head of the wikicode string. +*/ +static int Tokenizer_parse_argument(Tokenizer* self) +{ + PyObject *argument; + Py_ssize_t reset = self->head; + + argument = Tokenizer_parse(self, LC_ARGUMENT_NAME, 1); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!argument) + return -1; + if (Tokenizer_emit_first(self, ArgumentOpen)) { + Py_DECREF(argument); + return -1; + } + if (Tokenizer_emit_all(self, argument)) { + Py_DECREF(argument); + return -1; + } + Py_DECREF(argument); + if (Tokenizer_emit(self, ArgumentClose)) + return -1; + return 0; +} + +/* + Parse a template or argument at the head of the wikicode string. 
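+
+    For example (illustrative input, not from the original comment): for
+    "{{{foo}}}" the brace count reaches three, so the argument route is
+    tried first; if that route fails, "{{foo}}" is retried as a template
+    and the leftover "{" is emitted as plain text.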
+*/ +static int Tokenizer_parse_template_or_argument(Tokenizer* self) +{ + unsigned int braces = 2, i; + PyObject *tokenlist; + + self->head += 2; + while (Tokenizer_READ(self, 0) == '{' && braces < MAX_BRACES) { + self->head++; + braces++; + } + if (Tokenizer_push(self, 0)) + return -1; + while (braces) { + if (braces == 1) { + if (Tokenizer_emit_text_then_stack(self, "{")) + return -1; + return 0; + } + if (braces == 2) { + if (Tokenizer_parse_template(self)) + return -1; + if (BAD_ROUTE) { + RESET_ROUTE(); + if (Tokenizer_emit_text_then_stack(self, "{{")) + return -1; + return 0; + } + break; + } + if (Tokenizer_parse_argument(self)) + return -1; + if (BAD_ROUTE) { + RESET_ROUTE(); + if (Tokenizer_parse_template(self)) + return -1; + if (BAD_ROUTE) { + char text[MAX_BRACES + 1]; + RESET_ROUTE(); + for (i = 0; i < braces; i++) text[i] = '{'; + text[braces] = '\0'; + if (Tokenizer_emit_text_then_stack(self, text)) + return -1; + return 0; + } + else + braces -= 2; + } + else + braces -= 3; + if (braces) + self->head++; + } + tokenlist = Tokenizer_pop(self); + if (!tokenlist) + return -1; + if (Tokenizer_emit_all(self, tokenlist)) { + Py_DECREF(tokenlist); + return -1; + } + Py_DECREF(tokenlist); + if (self->topstack->context & LC_FAIL_NEXT) + self->topstack->context ^= LC_FAIL_NEXT; + return 0; +} + +/* + Handle a template parameter at the head of the string. +*/ +static int Tokenizer_handle_template_param(Tokenizer* self) +{ + PyObject *stack; + + if (self->topstack->context & LC_TEMPLATE_NAME) + self->topstack->context ^= LC_TEMPLATE_NAME; + else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) + self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE; + if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { + stack = Tokenizer_pop_keeping_context(self); + if (!stack) + return -1; + if (Tokenizer_emit_all(self, stack)) { + Py_DECREF(stack); + return -1; + } + Py_DECREF(stack); + } + else + self->topstack->context |= LC_TEMPLATE_PARAM_KEY; + if (Tokenizer_emit(self, TemplateParamSeparator)) + return -1; + if (Tokenizer_push(self, self->topstack->context)) + return -1; + return 0; +} + +/* + Handle a template parameter's value at the head of the string. +*/ +static int Tokenizer_handle_template_param_value(Tokenizer* self) +{ + PyObject *stack; + + stack = Tokenizer_pop_keeping_context(self); + if (!stack) + return -1; + if (Tokenizer_emit_all(self, stack)) { + Py_DECREF(stack); + return -1; + } + Py_DECREF(stack); + self->topstack->context ^= LC_TEMPLATE_PARAM_KEY; + self->topstack->context |= LC_TEMPLATE_PARAM_VALUE; + if (Tokenizer_emit(self, TemplateParamEquals)) + return -1; + return 0; +} + +/* + Handle the end of a template at the head of the string. +*/ +static PyObject* Tokenizer_handle_template_end(Tokenizer* self) +{ + PyObject* stack; + + if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { + stack = Tokenizer_pop_keeping_context(self); + if (!stack) + return NULL; + if (Tokenizer_emit_all(self, stack)) { + Py_DECREF(stack); + return NULL; + } + Py_DECREF(stack); + } + self->head++; + stack = Tokenizer_pop(self); + return stack; +} + +/* + Handle the separator between an argument's name and default. +*/ +static int Tokenizer_handle_argument_separator(Tokenizer* self) +{ + self->topstack->context ^= LC_ARGUMENT_NAME; + self->topstack->context |= LC_ARGUMENT_DEFAULT; + if (Tokenizer_emit(self, ArgumentSeparator)) + return -1; + return 0; +} + +/* + Handle the end of an argument at the head of the string. 
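+
+    Note (added for clarity): the head sits on the first "}" of the closing
+    "}}}" when this is called; advancing two places leaves it on the last
+    "}", which the caller's parse loop then steps past.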
+*/ +static PyObject* Tokenizer_handle_argument_end(Tokenizer* self) +{ + PyObject* stack = Tokenizer_pop(self); + + self->head += 2; + return stack; +} + +/* + Parse an internal wikilink at the head of the wikicode string. +*/ +static int Tokenizer_parse_wikilink(Tokenizer* self) +{ + Py_ssize_t reset; + PyObject *wikilink; + + self->head += 2; + reset = self->head - 1; + wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + if (Tokenizer_emit_text(self, "[[")) + return -1; + return 0; + } + if (!wikilink) + return -1; + if (Tokenizer_emit(self, WikilinkOpen)) { + Py_DECREF(wikilink); + return -1; + } + if (Tokenizer_emit_all(self, wikilink)) { + Py_DECREF(wikilink); + return -1; + } + Py_DECREF(wikilink); + if (Tokenizer_emit(self, WikilinkClose)) + return -1; + return 0; +} + +/* + Handle the separator between a wikilink's title and its text. +*/ +static int Tokenizer_handle_wikilink_separator(Tokenizer* self) +{ + self->topstack->context ^= LC_WIKILINK_TITLE; + self->topstack->context |= LC_WIKILINK_TEXT; + if (Tokenizer_emit(self, WikilinkSeparator)) + return -1; + return 0; +} + +/* + Handle the end of a wikilink at the head of the string. +*/ +static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self) +{ + PyObject* stack = Tokenizer_pop(self); + self->head += 1; + return stack; +} + +/* + Parse the URI scheme of a bracket-enclosed external link. +*/ +static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer* buffer; + PyObject* scheme; + Py_UNICODE this; + int slashes, i; + + if (Tokenizer_push(self, LC_EXT_LINK_URI)) + return -1; + if (Tokenizer_READ(self, 0) == '/' && Tokenizer_READ(self, 1) == '/') { + if (Tokenizer_emit_text(self, "//")) + return -1; + self->head += 2; + } + else { + buffer = Textbuffer_new(); + if (!buffer) + return -1; + while ((this = Tokenizer_READ(self, 0))) { + i = 0; + while (1) { + if (!valid[i]) + goto end_of_loop; + if (this == valid[i]) + break; + i++; + } + Textbuffer_write(&buffer, this); + if (Tokenizer_emit_char(self, this)) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + } + end_of_loop: + if (this != ':') { + Textbuffer_dealloc(buffer); + Tokenizer_fail_route(self); + return 0; + } + if (Tokenizer_emit_char(self, ':')) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + slashes = (Tokenizer_READ(self, 0) == '/' && + Tokenizer_READ(self, 1) == '/'); + if (slashes) { + if (Tokenizer_emit_text(self, "//")) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head += 2; + } + scheme = Textbuffer_render(buffer); + Textbuffer_dealloc(buffer); + if (!scheme) + return -1; + if (!IS_SCHEME(scheme, slashes, 0)) { + Py_DECREF(scheme); + Tokenizer_fail_route(self); + return 0; + } + Py_DECREF(scheme); + } + return 0; +} + +/* + Parse the URI scheme of a free (no brackets) external link. 
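+
+    For example (illustrative): in "see http://example.com", the scheme
+    "http" has already been consumed as plain text by the time the ":" is
+    reached, which is why it must be recovered from the textbuffer rather
+    than read ahead.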
+*/ +static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; + PyObject *scheme; + Py_UNICODE chunk; + Py_ssize_t i; + int slashes, j; + + if (!scheme_buffer) + return -1; + // We have to backtrack through the textbuffer looking for our scheme since + // it was just parsed as text: + temp_buffer = self->topstack->textbuffer; + while (temp_buffer) { + for (i = temp_buffer->size - 1; i >= 0; i--) { + chunk = temp_buffer->data[i]; + if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) + goto end_of_loop; + j = 0; + while (1) { + if (!valid[j]) { + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + if (chunk == valid[j]) + break; + j++; + } + Textbuffer_write(&scheme_buffer, chunk); + } + temp_buffer = temp_buffer->next; + } + end_of_loop: + scheme = Textbuffer_render(scheme_buffer); + if (!scheme) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + slashes = (Tokenizer_READ(self, 0) == '/' && + Tokenizer_READ(self, 1) == '/'); + if (!IS_SCHEME(scheme, slashes, 1)) { + Py_DECREF(scheme); + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + Py_DECREF(scheme); + if (Tokenizer_push(self, self->topstack->context | LC_EXT_LINK_URI)) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1)) + return -1; + if (Tokenizer_emit_char(self, ':')) + return -1; + if (slashes) { + if (Tokenizer_emit_text(self, "//")) + return -1; + self->head += 2; + } + return 0; +} + +/* + Handle text in a free external link, including trailing punctuation. +*/ +static int +Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, + Textbuffer** tail, Py_UNICODE this) +{ + #define PUSH_TAIL_BUFFER(tail, error) \ + if ((tail)->size || (tail)->next) { \ + if (Tokenizer_emit_textbuffer(self, tail, 0)) \ + return error; \ + tail = Textbuffer_new(); \ + if (!(tail)) \ + return error; \ + } + + if (this == '(' && !(*parens)) { + *parens = 1; + PUSH_TAIL_BUFFER(*tail, -1) + } + else if (this == ',' || this == ';' || this == '\\' || this == '.' || + this == ':' || this == '!' || this == '?' || + (!(*parens) && this == ')')) + return Textbuffer_write(tail, this); + else + PUSH_TAIL_BUFFER(*tail, -1) + return Tokenizer_emit_char(self, this); +} + +/* + Return whether the current head is the end of a free link. +*/ +static int +Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next) +{ + // Built from Tokenizer_parse()'s end sentinels: + Py_UNICODE after = Tokenizer_READ(self, 2); + uint64_t ctx = self->topstack->context; + + return (!this || this == '\n' || this == '[' || this == ']' || + this == '<' || this == '>' || (this == '\'' && next == '\'') || + (this == '|' && ctx & LC_TEMPLATE) || + (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || + (this == '}' && next == '}' && + (ctx & LC_TEMPLATE || (after == '}' && ctx & LC_ARGUMENT)))); +} + +/* + Really parse an external link. +*/ +static PyObject* +Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, + Textbuffer** extra) +{ + Py_UNICODE this, next; + int parens = 0; + + if (brackets ? 
Tokenizer_parse_bracketed_uri_scheme(self) : + Tokenizer_parse_free_uri_scheme(self)) + return NULL; + if (BAD_ROUTE) + return NULL; + this = Tokenizer_READ(self, 0); + if (!this || this == '\n' || this == ' ' || this == ']') + return Tokenizer_fail_route(self); + if (!brackets && this == '[') + return Tokenizer_fail_route(self); + while (1) { + this = Tokenizer_READ(self, 0); + next = Tokenizer_READ(self, 1); + if (this == '&') { + PUSH_TAIL_BUFFER(*extra, NULL) + if (Tokenizer_parse_entity(self)) + return NULL; + } + else if (this == '<' && next == '!' + && Tokenizer_READ(self, 2) == '-' + && Tokenizer_READ(self, 3) == '-') { + PUSH_TAIL_BUFFER(*extra, NULL) + if (Tokenizer_parse_comment(self)) + return NULL; + } + else if (!brackets && Tokenizer_is_free_link(self, this, next)) { + self->head--; + return Tokenizer_pop(self); + } + else if (!this || this == '\n') + return Tokenizer_fail_route(self); + else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) { + PUSH_TAIL_BUFFER(*extra, NULL) + if (Tokenizer_parse_template_or_argument(self)) + return NULL; + } + else if (this == ']') + return Tokenizer_pop(self); + else if (this == ' ') { + if (brackets) { + if (Tokenizer_emit(self, ExternalLinkSeparator)) + return NULL; + self->topstack->context ^= LC_EXT_LINK_URI; + self->topstack->context |= LC_EXT_LINK_TITLE; + self->head++; + return Tokenizer_parse(self, 0, 0); + } + if (Textbuffer_write(extra, ' ')) + return NULL; + return Tokenizer_pop(self); + } + else if (!brackets) { + if (Tokenizer_handle_free_link_text(self, &parens, extra, this)) + return NULL; + } + else { + if (Tokenizer_emit_char(self, this)) + return NULL; + } + self->head++; + } +} + +/* + Remove the URI scheme of a new external link from the textbuffer. +*/ +static int +Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) +{ + PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"), + *split, *scheme; + Py_ssize_t length; + Textbuffer* temp; + + if (!text) + return -1; + split = PyObject_CallMethod(text, "split", "si", ":", 1); + Py_DECREF(text); + if (!split) + return -1; + scheme = PyList_GET_ITEM(split, 0); + length = PyUnicode_GET_SIZE(scheme); + while (length) { + temp = self->topstack->textbuffer; + if (length <= temp->size) { + temp->size -= length; + break; + } + length -= temp->size; + self->topstack->textbuffer = temp->next; + free(temp->data); + free(temp); + } + Py_DECREF(split); + return 0; +} + +/* + Parse an external link at the head of the wikicode string. 
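+
+    Two illustrative inputs: "[http://example.com Example]" (brackets)
+    gives the link a title, while a bare "http://example.com," (no
+    brackets) becomes a free link whose trailing comma is diverted into
+    the "extra" tail buffer and emitted as plain text afterwards.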
+*/ +static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) +{ + #define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS + #define NOT_A_LINK \ + if (!brackets && self->topstack->context & LC_DLTERM) \ + return Tokenizer_handle_dl_term(self); \ + return Tokenizer_emit_char(self, Tokenizer_READ(self, 0)) + + Py_ssize_t reset = self->head; + PyObject *link, *kwargs; + Textbuffer *extra = 0; + + if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { + NOT_A_LINK; + } + extra = Textbuffer_new(); + if (!extra) + return -1; + self->head++; + link = Tokenizer_really_parse_external_link(self, brackets, &extra); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + Textbuffer_dealloc(extra); + NOT_A_LINK; + } + if (!link) { + Textbuffer_dealloc(extra); + return -1; + } + if (!brackets) { + if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) { + Textbuffer_dealloc(extra); + Py_DECREF(link); + return -1; + } + } + kwargs = PyDict_New(); + if (!kwargs) { + Textbuffer_dealloc(extra); + Py_DECREF(link); + return -1; + } + PyDict_SetItemString(kwargs, "brackets", brackets ? Py_True : Py_False); + if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) { + Textbuffer_dealloc(extra); + Py_DECREF(link); + return -1; + } + if (Tokenizer_emit_all(self, link)) { + Textbuffer_dealloc(extra); + Py_DECREF(link); + return -1; + } + Py_DECREF(link); + if (Tokenizer_emit(self, ExternalLinkClose)) { + Textbuffer_dealloc(extra); + return -1; + } + if (extra->size || extra->next) + return Tokenizer_emit_textbuffer(self, extra, 0); + Textbuffer_dealloc(extra); + return 0; +} + +/* + Parse a section heading at the head of the wikicode string. +*/ +static int Tokenizer_parse_heading(Tokenizer* self) +{ + Py_ssize_t reset = self->head; + int best = 1, i, context, diff; + HeadingData *heading; + PyObject *level, *kwargs; + + self->global |= GL_HEADING; + self->head += 1; + while (Tokenizer_READ(self, 0) == '=') { + best++; + self->head++; + } + context = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1); + heading = (HeadingData*) Tokenizer_parse(self, context, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset + best - 1; + for (i = 0; i < best; i++) { + if (Tokenizer_emit_char(self, '=')) + return -1; + } + self->global ^= GL_HEADING; + return 0; + } + level = NEW_INT_FUNC(heading->level); + if (!level) { + Py_DECREF(heading->title); + free(heading); + return -1; + } + kwargs = PyDict_New(); + if (!kwargs) { + Py_DECREF(level); + Py_DECREF(heading->title); + free(heading); + return -1; + } + PyDict_SetItemString(kwargs, "level", level); + Py_DECREF(level); + if (Tokenizer_emit_kwargs(self, HeadingStart, kwargs)) { + Py_DECREF(heading->title); + free(heading); + return -1; + } + if (heading->level < best) { + diff = best - heading->level; + for (i = 0; i < diff; i++) { + if (Tokenizer_emit_char(self, '=')) { + Py_DECREF(heading->title); + free(heading); + return -1; + } + } + } + if (Tokenizer_emit_all(self, heading->title)) { + Py_DECREF(heading->title); + free(heading); + return -1; + } + Py_DECREF(heading->title); + free(heading); + if (Tokenizer_emit(self, HeadingEnd)) + return -1; + self->global ^= GL_HEADING; + return 0; +} + +/* + Handle the end of a section heading at the head of the string. 
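+
+    For example (illustrative): "=== Title ==" opens with three "=" but
+    closes with only two, so the level resolves to 2 and
+    Tokenizer_parse_heading emits the surplus opening "=" as text at the
+    start of the title.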
+*/ +static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self) +{ + Py_ssize_t reset = self->head; + int best, i, current, level, diff; + HeadingData *after, *heading; + PyObject *stack; + + self->head += 1; + best = 1; + while (Tokenizer_READ(self, 0) == '=') { + best++; + self->head++; + } + current = heading_level_from_context(self->topstack->context); + level = current > best ? (best > 6 ? 6 : best) : + (current > 6 ? 6 : current); + after = (HeadingData*) Tokenizer_parse(self, self->topstack->context, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + if (level < best) { + diff = best - level; + for (i = 0; i < diff; i++) { + if (Tokenizer_emit_char(self, '=')) + return NULL; + } + } + self->head = reset + best - 1; + } + else { + for (i = 0; i < best; i++) { + if (Tokenizer_emit_char(self, '=')) { + Py_DECREF(after->title); + free(after); + return NULL; + } + } + if (Tokenizer_emit_all(self, after->title)) { + Py_DECREF(after->title); + free(after); + return NULL; + } + Py_DECREF(after->title); + level = after->level; + free(after); + } + stack = Tokenizer_pop(self); + if (!stack) + return NULL; + heading = malloc(sizeof(HeadingData)); + if (!heading) { + PyErr_NoMemory(); + return NULL; + } + heading->title = stack; + heading->level = level; + return heading; +} + +/* + Actually parse an HTML entity and ensure that it is valid. +*/ +static int Tokenizer_really_parse_entity(Tokenizer* self) +{ + PyObject *kwargs, *textobj; + Py_UNICODE this; + int numeric, hexadecimal, i, j, zeroes, test; + char *valid, *text, *buffer, *def; + + #define FAIL_ROUTE_AND_EXIT() { \ + Tokenizer_fail_route(self); \ + free(text); \ + return 0; \ + } + + if (Tokenizer_emit(self, HTMLEntityStart)) + return -1; + self->head++; + this = Tokenizer_READ(self, 0); + if (!this) { + Tokenizer_fail_route(self); + return 0; + } + if (this == '#') { + numeric = 1; + if (Tokenizer_emit(self, HTMLEntityNumeric)) + return -1; + self->head++; + this = Tokenizer_READ(self, 0); + if (!this) { + Tokenizer_fail_route(self); + return 0; + } + if (this == 'x' || this == 'X') { + hexadecimal = 1; + kwargs = PyDict_New(); + if (!kwargs) + return -1; + PyDict_SetItemString(kwargs, "char", Tokenizer_read(self, 0)); + if (Tokenizer_emit_kwargs(self, HTMLEntityHex, kwargs)) + return -1; + self->head++; + } + else + hexadecimal = 0; + } + else + numeric = hexadecimal = 0; + if (hexadecimal) + valid = HEXDIGITS; + else if (numeric) + valid = DIGITS; + else + valid = ALPHANUM; + text = calloc(MAX_ENTITY_SIZE, sizeof(char)); + if (!text) { + PyErr_NoMemory(); + return -1; + } + i = 0; + zeroes = 0; + while (1) { + this = Tokenizer_READ(self, 0); + if (this == ';') { + if (i == 0) + FAIL_ROUTE_AND_EXIT() + break; + } + if (i == 0 && this == '0') { + zeroes++; + self->head++; + continue; + } + if (i >= MAX_ENTITY_SIZE) + FAIL_ROUTE_AND_EXIT() + if (is_marker(this)) + FAIL_ROUTE_AND_EXIT() + j = 0; + while (1) { + if (!valid[j]) + FAIL_ROUTE_AND_EXIT() + if (this == valid[j]) + break; + j++; + } + text[i] = (char) this; + self->head++; + i++; + } + if (numeric) { + sscanf(text, (hexadecimal ? 
"%x" : "%d"), &test); + if (test < 1 || test > 0x10FFFF) + FAIL_ROUTE_AND_EXIT() + } + else { + i = 0; + while (1) { + def = entitydefs[i]; + if (!def) // We've reached the end of the defs without finding it + FAIL_ROUTE_AND_EXIT() + if (strcmp(text, def) == 0) + break; + i++; + } + } + if (zeroes) { + buffer = calloc(strlen(text) + zeroes + 1, sizeof(char)); + if (!buffer) { + free(text); + PyErr_NoMemory(); + return -1; + } + for (i = 0; i < zeroes; i++) + strcat(buffer, "0"); + strcat(buffer, text); + free(text); + text = buffer; + } + textobj = PyUnicode_FromString(text); + if (!textobj) { + free(text); + return -1; + } + free(text); + kwargs = PyDict_New(); + if (!kwargs) { + Py_DECREF(textobj); + return -1; + } + PyDict_SetItemString(kwargs, "text", textobj); + Py_DECREF(textobj); + if (Tokenizer_emit_kwargs(self, Text, kwargs)) + return -1; + if (Tokenizer_emit(self, HTMLEntityEnd)) + return -1; + return 0; +} + +/* + Parse an HTML entity at the head of the wikicode string. +*/ +static int Tokenizer_parse_entity(Tokenizer* self) +{ + Py_ssize_t reset = self->head; + PyObject *tokenlist; + + if (Tokenizer_push(self, 0)) + return -1; + if (Tokenizer_really_parse_entity(self)) + return -1; + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + if (Tokenizer_emit_char(self, '&')) + return -1; + return 0; + } + tokenlist = Tokenizer_pop(self); + if (!tokenlist) + return -1; + if (Tokenizer_emit_all(self, tokenlist)) { + Py_DECREF(tokenlist); + return -1; + } + Py_DECREF(tokenlist); + return 0; +} + +/* + Parse an HTML comment at the head of the wikicode string. +*/ +static int Tokenizer_parse_comment(Tokenizer* self) +{ + Py_ssize_t reset = self->head + 3; + PyObject *comment; + Py_UNICODE this; + + self->head += 4; + if (Tokenizer_push(self, 0)) + return -1; + while (1) { + this = Tokenizer_READ(self, 0); + if (!this) { + comment = Tokenizer_pop(self); + Py_XDECREF(comment); + self->head = reset; + return Tokenizer_emit_text(self, "