diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index ad013cb..41713e2 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -21,45 +21,7 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
-#ifndef PY_SSIZE_T_CLEAN
-#define PY_SSIZE_T_CLEAN
-#endif
-
-#include <Python.h>
-#include <setjmp.h>
-#include <structmember.h>
-
-static PyObject* EMPTY;
-
-#define PU (Py_UNICODE*)
-static const Py_UNICODE* MARKERS[] = {PU"{", PU"}", PU"[", PU"]", PU"<", PU">",
-                                      PU"|", PU"=", PU"&", PU"#", PU"*", PU";",
-                                      PU":", PU"/", PU"-", PU"!", PU"\n", PU""};
-static const int NUM_MARKERS = 17;
-
-#define CONTEXT(name) PyInt_AsSsize_t((PyIntObject*) \
-                          PyObject_GetAttrString(contexts, name))
-
-static jmp_buf exception_env;
-static const int BAD_ROUTE = 1;
-
-static PyObject* contexts;
-static PyObject* tokens;
-
-static PyMethodDef
-module_methods[] = {
-    {NULL}
-};
-
-typedef struct {
-    PyObject_HEAD
-    PyObject* text;        /* text to tokenize */
-    PyObject* stacks;      /* token stacks */
-    PyObject* topstack;    /* topmost stack */
-    Py_ssize_t head;       /* current position in text */
-    Py_ssize_t length;     /* length of text */
-    Py_ssize_t global;     /* global context */
-} Tokenizer;
+#include "tokenizer.h"

 static PyObject*
 Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
@@ -104,11 +66,6 @@ Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
     return 0;
 }

-#define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0)
-#define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1)
-#define Tokenizer_CONTEXT_VAL(self) PyInt_AsSsize_t((PyIntObject*) Tokenizer_CONTEXT(self))
-#define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2)
-
 static int
 Tokenizer_set_context(Tokenizer* self, Py_ssize_t value)
 {
@@ -539,9 +496,7 @@ Tokenizer_parse_comment(Tokenizer* self)
 static PyObject*
 Tokenizer_parse(Tokenizer* self, Py_ssize_t context)
 {
-    Py_ssize_t fail_contexts = (
-        CONTEXT("TEMPLATE") | CONTEXT("ARGUMENT") | CONTEXT("HEADING") |
-        CONTEXT("COMMENT"));
+    Py_ssize_t fail_contexts = LC_TEMPLATE | LC_ARGUMENT | LC_HEADING | LC_COMMENT;
     PyObject *this, *next;
     Py_UNICODE *this_data, *next_data, *next_next_data, *last_data;

@@ -580,7 +535,7 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context)
         next = Tokenizer_read(self, 1);
         next_data = PyUnicode_AS_UNICODE(next);

-        if (this_context & CONTEXT("COMMENT")) {
+        if (this_context & LC_COMMENT) {
             if (this_data == next_data && next_data == PU "-") {
                 if (PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == PU ">") {
                     return Tokenizer_pop(self);
@@ -591,42 +546,40 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context)
         else if (this_data == next_data && next_data == PU "{") {
             Tokenizer_parse_template_or_argument(self);
         }
-        else if (this_data == PU "|" && this_context & CONTEXT("TEMPLATE")) {
+        else if (this_data == PU "|" && this_context & LC_TEMPLATE) {
             Tokenizer_handle_template_param(self);
         }
-        else if (this_data == PU "=" && this_context & CONTEXT("TEMPLATE_PARAM_KEY")) {
+        else if (this_data == PU "=" && this_context & LC_TEMPLATE_PARAM_KEY) {
             Tokenizer_handle_template_param_value(self);
         }
-        else if (this_data == next_data && next_data == PU "}" &&
-                 this_context & CONTEXT("TEMPLATE")) {
+        else if (this_data == next_data && next_data == PU "}" && this_context & LC_TEMPLATE) {
             Tokenizer_handle_template_end(self);
         }
-        else if (this_data == PU "|" && this_context & CONTEXT("ARGUMENT_NAME")) {
+        else if (this_data == PU "|" && this_context & LC_ARGUMENT_NAME) {
             Tokenizer_handle_argument_separator(self);
         }
-        else if (this_data == next_data && next_data == PU "}" &&
-                 this_context & CONTEXT("ARGUMENT")) {
+        else if (this_data == next_data && next_data == PU "}" && this_context & LC_ARGUMENT) {
             if (PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == PU "}") {
                 return Tokenizer_handle_argument_end(self);
             }
             Tokenizer_write_text(self, this);
         }
         else if (this_data == next_data && next_data == PU "[") {
-            if (!(this_context & CONTEXT("WIKILINK_TITLE"))) {
+            if (!(this_context & LC_WIKILINK_TITLE)) {
                 Tokenizer_parse_wikilink(self);
             }
             else {
                 Tokenizer_write_text(self, this);
             }
         }
-        else if (this_data == PU "|" && this_context & CONTEXT("WIKILINK_TITLE")) {
+        else if (this_data == PU "|" && this_context & LC_WIKILINK_TITLE) {
             Tokenizer_handle_wikilink_separator(self);
         }
         else if (this_data == next_data && next_data == PU "]" &&
-                 this_context & CONTEXT("WIKILINK")) {
+                 this_context & LC_WIKILINK) {
             return Tokenizer_handle_wikilink_end(self);
         }
-        else if (this_data == PU "=" && !(self->global & CONTEXT("GL_HEADING"))) {
+        else if (this_data == PU "=" && !(self->global & GL_HEADING)) {
             last_data = PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, 1));
             if (last_data == PU "\n" || last_data == PU "") {
                 Tokenizer_parse_heading(self);
@@ -635,10 +588,10 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context)
                 Tokenizer_write_text(self, this);
             }
         }
-        else if (this_data == PU "=" && this_context & CONTEXT("HEADING")) {
+        else if (this_data == PU "=" && this_context & LC_HEADING) {
             return Tokenizer_handle_heading_end(self);
         }
-        else if (this_data == PU "\n" && this_context & CONTEXT("HEADING")) {
+        else if (this_data == PU "\n" && this_context & LC_HEADING) {
             Tokenizer_fail_route(self);
         }
         else if (this_data == PU "&") {
@@ -700,61 +653,6 @@ Tokenizer_tokenize(Tokenizer* self, PyObject *args)
     return Tokenizer_parse(self, 0);
 }

-static PyMethodDef
-Tokenizer_methods[] = {
-    {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
-     "Build a list of tokens from a string of wikicode and return it."},
-    {NULL}
-};
-
-static PyMemberDef
-Tokenizer_members[] = {
-    {NULL}
-};
-
-static PyTypeObject
-TokenizerType = {
-    PyObject_HEAD_INIT(NULL)
-    0,                                          /* ob_size */
-    "_tokenizer.CTokenizer",                    /* tp_name */
-    sizeof(Tokenizer),                          /* tp_basicsize */
-    0,                                          /* tp_itemsize */
-    (destructor) Tokenizer_dealloc,             /* tp_dealloc */
-    0,                                          /* tp_print */
-    0,                                          /* tp_getattr */
-    0,                                          /* tp_setattr */
-    0,                                          /* tp_compare */
-    0,                                          /* tp_repr */
-    0,                                          /* tp_as_number */
-    0,                                          /* tp_as_sequence */
-    0,                                          /* tp_as_mapping */
-    0,                                          /* tp_hash */
-    0,                                          /* tp_call */
-    0,                                          /* tp_str */
-    0,                                          /* tp_getattro */
-    0,                                          /* tp_setattro */
-    0,                                          /* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT,                         /* tp_flags */
-    "Creates a list of tokens from a string of wikicode.", /* tp_doc */
-    0,                                          /* tp_traverse */
-    0,                                          /* tp_clear */
-    0,                                          /* tp_richcompare */
-    0,                                          /* tp_weaklistoffset */
-    0,                                          /* tp_iter */
-    0,                                          /* tp_iternext */
-    Tokenizer_methods,                          /* tp_methods */
-    Tokenizer_members,                          /* tp_members */
-    0,                                          /* tp_getset */
-    0,                                          /* tp_base */
-    0,                                          /* tp_dict */
-    0,                                          /* tp_descr_get */
-    0,                                          /* tp_descr_set */
-    0,                                          /* tp_dictoffset */
-    (initproc) Tokenizer_init,                  /* tp_init */
-    0,                                          /* tp_alloc */
-    Tokenizer_new,                              /* tp_new */
-};
-
 PyMODINIT_FUNC
 init_tokenizer(void)
 {
@@ -775,7 +673,6 @@ init_tokenizer(void)
     PyObject* locals = PyEval_GetLocals();
     PyObject* fromlist = PyList_New(0);

-    contexts = PyImport_ImportModuleLevel("contexts", globals, locals, fromlist, 1);
     tokens = PyImport_ImportModuleLevel("tokens", globals, locals, fromlist, 1);
     Py_DECREF(fromlist);
 }
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
new file mode 100644
index 0000000..c504dd8
--- /dev/null
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -0,0 +1,199 @@
+/*
+Tokenizer Header File for MWParserFromHell
+Copyright (C) 2012 Ben Kurtovic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef PY_SSIZE_T_CLEAN
+#define PY_SSIZE_T_CLEAN
+#endif
+
+#include <Python.h>
+#include <setjmp.h>
+#include <structmember.h>
+
+#define PU (Py_UNICODE*)
+
+static const Py_UNICODE* MARKERS[] = {
+    PU "{", PU "}", PU "[", PU "]", PU "<", PU ">", PU "|", PU "=", PU "&",
+    PU "#", PU "*", PU ";", PU ":", PU "/", PU "-", PU "!", PU "\n", PU ""};
+static const int NUM_MARKERS = 17;
+
+static jmp_buf exception_env;
+static const int BAD_ROUTE = 1;
+
+static PyObject* EMPTY;
+static PyObject* tokens;
+
+
+/* Local contexts: */
+
+static const Py_ssize_t LC_TEMPLATE =             0x0007;
+static const Py_ssize_t LC_TEMPLATE_NAME =        0x0001;
+static const Py_ssize_t LC_TEMPLATE_PARAM_KEY =   0x0002;
+static const Py_ssize_t LC_TEMPLATE_PARAM_VALUE = 0x0004;
+
+static const Py_ssize_t LC_ARGUMENT =         0x0018;
+static const Py_ssize_t LC_ARGUMENT_NAME =    0x0008;
+static const Py_ssize_t LC_ARGUMENT_DEFAULT = 0x0010;
+
+static const Py_ssize_t LC_WIKILINK =       0x0060;
+static const Py_ssize_t LC_WIKILINK_TITLE = 0x0020;
+static const Py_ssize_t LC_WIKILINK_TEXT =  0x0040;
+
+static const Py_ssize_t LC_HEADING =         0x1f80;
+static const Py_ssize_t LC_HEADING_LEVEL_1 = 0x0080;
+static const Py_ssize_t LC_HEADING_LEVEL_2 = 0x0100;
+static const Py_ssize_t LC_HEADING_LEVEL_3 = 0x0200;
+static const Py_ssize_t LC_HEADING_LEVEL_4 = 0x0400;
+static const Py_ssize_t LC_HEADING_LEVEL_5 = 0x0800;
+static const Py_ssize_t LC_HEADING_LEVEL_6 = 0x1000;
+
+static const Py_ssize_t LC_COMMENT = 0x2000;
+
+
+/* Global contexts: */
+
+static const Py_ssize_t GL_HEADING = 0x1;
+
+
+/* Tokenizer object definition: */
+
+typedef struct {
+    PyObject_HEAD
+    PyObject* text;        /* text to tokenize */
+    PyObject* stacks;      /* token stacks */
+    PyObject* topstack;    /* topmost stack */
+    Py_ssize_t head;       /* current position in text */
+    Py_ssize_t length;     /* length of text */
+    Py_ssize_t global;     /* global context */
+} Tokenizer;
+
+
+/* Some macros for accessing Tokenizer data: */
+
+#define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0)
+#define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1)
+#define Tokenizer_CONTEXT_VAL(self) PyInt_AsSsize_t(Tokenizer_CONTEXT(self))
+#define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2)
+
+
+/* Tokenizer function prototypes: */
+
+static PyObject* Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds);
+static void Tokenizer_dealloc(Tokenizer* self);
+static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds);
+static int Tokenizer_set_context(Tokenizer* self, Py_ssize_t value);
+static int Tokenizer_set_textbuffer(Tokenizer* self, PyObject* value);
+static int Tokenizer_push(Tokenizer* self, Py_ssize_t context);
+static int Tokenizer_push_textbuffer(Tokenizer* self);
+static int Tokenizer_delete_top_of_stack(Tokenizer* self);
+static PyObject* Tokenizer_pop(Tokenizer* self);
+static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self);
+static void Tokenizer_fail_route(Tokenizer* self);
+static int Tokenizer_write(Tokenizer* self, PyObject* token);
+static int Tokenizer_write_first(Tokenizer* self, PyObject* token);
+static int Tokenizer_write_text(Tokenizer* self, PyObject* text);
+static int Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist);
+static int Tokenizer_write_text_then_stack(Tokenizer* self, PyObject* text);
+static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta);
+static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta);
+static int Tokenizer_parse_template_or_argument(Tokenizer* self);
+static int Tokenizer_parse_template(Tokenizer* self);
+static int Tokenizer_parse_argument(Tokenizer* self);
+static int Tokenizer_verify_safe(Tokenizer* self, Py_UNICODE* unsafes[]);
+static int Tokenizer_handle_template_param(Tokenizer* self);
+static int Tokenizer_handle_template_param_value(Tokenizer* self);
+static PyObject* Tokenizer_handle_template_end(Tokenizer* self);
+static int Tokenizer_handle_argument_separator(Tokenizer* self);
+static PyObject* Tokenizer_handle_argument_end(Tokenizer* self);
+static int Tokenizer_parse_wikilink(Tokenizer* self);
+static int Tokenizer_handle_wikilink_separator(Tokenizer* self);
+static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self);
+static int Tokenizer_parse_heading(Tokenizer* self);
+static PyObject* Tokenizer_handle_heading_end(Tokenizer* self);
+static int Tokenizer_really_parse_entity(Tokenizer* self);
+static int Tokenizer_parse_entity(Tokenizer* self);
+static int Tokenizer_parse_comment(Tokenizer* self);
+static PyObject* Tokenizer_parse(Tokenizer* self, Py_ssize_t context);
+static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject *args);
+
+
+/* More structs for creating the Tokenizer type: */
+
+static PyMethodDef
+Tokenizer_methods[] = {
+    {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
+     "Build a list of tokens from a string of wikicode and return it."},
+    {NULL}
+};
+
+static PyMemberDef
+Tokenizer_members[] = {
+    {NULL}
+};
+
+static PyMethodDef
+module_methods[] = {
+    {NULL}
+};
+
+static PyTypeObject
+TokenizerType = {
+    PyObject_HEAD_INIT(NULL)
+    0,                                          /* ob_size */
+    "_tokenizer.CTokenizer",                    /* tp_name */
+    sizeof(Tokenizer),                          /* tp_basicsize */
+    0,                                          /* tp_itemsize */
+    (destructor) Tokenizer_dealloc,             /* tp_dealloc */
+    0,                                          /* tp_print */
+    0,                                          /* tp_getattr */
+    0,                                          /* tp_setattr */
+    0,                                          /* tp_compare */
+    0,                                          /* tp_repr */
+    0,                                          /* tp_as_number */
+    0,                                          /* tp_as_sequence */
+    0,                                          /* tp_as_mapping */
+    0,                                          /* tp_hash */
+    0,                                          /* tp_call */
+    0,                                          /* tp_str */
+    0,                                          /* tp_getattro */
+    0,                                          /* tp_setattro */
+    0,                                          /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,                         /* tp_flags */
+    "Creates a list of tokens from a string of wikicode.", /* tp_doc */
+    0,                                          /* tp_traverse */
+    0,                                          /* tp_clear */
+    0,                                          /* tp_richcompare */
+    0,                                          /* tp_weaklistoffset */
+    0,                                          /* tp_iter */
+    0,                                          /* tp_iternext */
+    Tokenizer_methods,                          /* tp_methods */
+    Tokenizer_members,                          /* tp_members */
+    0,                                          /* tp_getset */
+    0,                                          /* tp_base */
+    0,                                          /* tp_dict */
+    0,                                          /* tp_descr_get */
+    0,                                          /* tp_descr_set */
+    0,                                          /* tp_dictoffset */
+    (initproc) Tokenizer_init,                  /* tp_init */
+    0,                                          /* tp_alloc */
+    Tokenizer_new,                              /* tp_new */
+};
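
Aside (not part of the patch): a minimal standalone sketch of how the bit-flag
contexts introduced in tokenizer.h compose. Each aggregate flag (LC_TEMPLATE,
LC_ARGUMENT, ...) is the bitwise OR of its sub-contexts, which is why the patch
can replace the old CONTEXT("...") Python attribute lookups with plain C
bitwise tests. The constant values below are copied from tokenizer.h; main()
and this_context are hypothetical test scaffolding, not code from the patch.

    #include <assert.h>

    /* Flag values copied from tokenizer.h so this check compiles standalone. */
    enum {
        LC_TEMPLATE_NAME        = 0x0001,
        LC_TEMPLATE_PARAM_KEY   = 0x0002,
        LC_TEMPLATE_PARAM_VALUE = 0x0004,
        LC_TEMPLATE             = 0x0007,
        LC_COMMENT              = 0x2000
    };

    int main(void)
    {
        /* The aggregate flag is exactly the union of its sub-contexts. */
        assert(LC_TEMPLATE == (LC_TEMPLATE_NAME | LC_TEMPLATE_PARAM_KEY |
                               LC_TEMPLATE_PARAM_VALUE));

        long this_context = LC_TEMPLATE_PARAM_KEY;  /* hypothetical parser state */
        assert(this_context & LC_TEMPLATE);         /* inside a template */
        assert(!(this_context & LC_COMMENT));       /* not inside a comment */
        return 0;
    }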