diff --git a/docs/conf.py b/docs/conf.py
index 6cc3664..cff089b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -17,6 +17,7 @@ import sys, os
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 sys.path.insert(0, os.path.abspath('..'))
+import mwparserfromhell
 
 # -- General configuration -----------------------------------------------------
 
@@ -48,9 +49,9 @@ copyright = u'2012 Ben Kurtovic'
 # built documents.
 #
 # The short X.Y version.
-version = '0.2'
+version = ".".join(mwparserfromhell.__version__.split(".", 2)[:2])
 # The full version, including alpha/beta/rc tags.
-release = '0.2.dev'
+release = mwparserfromhell.__version__
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
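
A note on the docs/conf.py change above: the short X.Y version is now derived
from the package's own version string instead of being hard-coded, so version
bumps only need to happen in mwparserfromhell/__init__.py. For illustration,
here is how the expression behaves on the old release string (an interactive
sketch, not part of the patch):

    >>> full = "0.2.dev"                  # e.g. mwparserfromhell.__version__
    >>> full.split(".", 2)                # at most two splits
    ['0', '2', 'dev']
    >>> ".".join(full.split(".", 2)[:2])  # keep only major.minor
    '0.2'
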
diff --git a/mwparserfromhell/parser/builder.c b/mwparserfromhell/parser/builder.c
new file mode 100644
index 0000000..7cbe236
--- /dev/null
+++ b/mwparserfromhell/parser/builder.c
@@ -0,0 +1,24 @@
+/*
+Builder for MWParserFromHell
+Copyright (C) 2012 Ben Kurtovic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include <Python.h>
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
new file mode 100644
index 0000000..3fdc370
--- /dev/null
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -0,0 +1,322 @@
+/*
+Tokenizer for MWParserFromHell
+Copyright (C) 2012 Ben Kurtovic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef PY_SSIZE_T_CLEAN
+#define PY_SSIZE_T_CLEAN
+#endif
+
+#include <Python.h>
+#include "structmember.h"
+
+static PyObject* TextToken;  /* the tokens.Text class, loaded in init_tokenizer() */
+
+static const Py_UNICODE OUT_OF_BOUNDS[] = {0};  /* sentinel for reads past the end */
+static const Py_UNICODE MARKERS[] = {'{', '}', '[', ']', '<', '>', '|', '=',
+                                     '&', '#', '*', ';', ':', '/', '-', '!',
+                                     '\n', 0};  /* 0 terminates the list */
+
+static PyMethodDef
+module_methods[] = {
+    {NULL}
+};
+
+typedef struct {
+    PyObject_HEAD
+    PyObject* text;        /* text to tokenize */
+    PyObject* stacks;      /* token stacks */
+    PyObject* topstack;    /* topmost stack */
+    Py_ssize_t head;       /* current position in text */
+    Py_ssize_t length;     /* length of text */
+    Py_ssize_t global;     /* global context */
+} Tokenizer;
+
+static PyObject*
+Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
+{
+    Tokenizer* self;
+
+    self = (Tokenizer*) type->tp_alloc(type, 0);
+    if (self != NULL) {
+
+        self->text = Py_None;
+        Py_INCREF(Py_None);
+
+        self->stacks = PyList_New(0);
+        if (self->stacks == NULL) {
+            Py_DECREF(self);
+            return NULL;
+        }
+
+        self->topstack = NULL;
+        self->head = 0;
+        self->length = 0;
+        self->global = 0;
+    }
+
+    return (PyObject*) self;
+}
+
+static void
+Tokenizer_dealloc(Tokenizer* self)
+{
+    Py_XDECREF(self->text);
+    Py_XDECREF(self->stacks);
+    Py_XDECREF(self->topstack);
+    self->ob_type->tp_free((PyObject*) self);
+}
+
+static int
+Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
+{
+    static char* kwlist[] = {NULL};
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
+        return -1;
+    return 0;
+}
+
+#define Tokenizer_STACK(self) PyList_GET_ITEM(self->topstack, 0)
+#define Tokenizer_CONTEXT(self) PyList_GET_ITEM(self->topstack, 1)
+#define Tokenizer_TEXTBUFFER(self) PyList_GET_ITEM(self->topstack, 2)
+
+static int
+Tokenizer_set_context(Tokenizer* self, Py_ssize_t value)
+{
+    if (PyList_SetItem(self->topstack, 1, PyInt_FromSsize_t(value)))
+        return -1;
+    return 0;
+}
+
+static int
+Tokenizer_set_textbuffer(Tokenizer* self, PyObject* value)
+{
+    if (PyList_SetItem(self->topstack, 2, value))
+        return -1;
+    return 0;
+}
+
+/*
+    Add a new token stack, context, and textbuffer to the list.
+*/
+static int
+Tokenizer_push(Tokenizer* self, int context)
+{
+    PyObject* top = PyList_New(3);
+    if (!top)
+        return -1;
+    PyList_SET_ITEM(top, 0, PyList_New(0));
+    PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(context));
+    PyList_SET_ITEM(top, 2, PyList_New(0));
+
+    Py_XDECREF(self->topstack);
+    self->topstack = top;
+
+    if (PyList_Append(self->stacks, top))
+        return -1;
+    return 0;
+}
+
+/*
+    Push the textbuffer onto the stack as a Text node and clear it.
+*/
+static int
+Tokenizer_push_textbuffer(Tokenizer* self)
+{
+    if (PyList_GET_SIZE(Tokenizer_TEXTBUFFER(self)) > 0) {
+        /* text = tokens.Text(text="".join(self._textbuffer)) */
+        PyObject* sep = PyUnicode_FromString("");
+        PyObject* joined = sep ? PyUnicode_Join(sep, Tokenizer_TEXTBUFFER(self)) : NULL;
+        Py_XDECREF(sep);
+        PyObject* args = PyTuple_New(0);
+        PyObject* kwargs = joined ? Py_BuildValue("{s:N}", "text", joined) : NULL;
+        PyObject* text = (args && kwargs) ? PyObject_Call(TextToken, args, kwargs) : NULL;
+        Py_XDECREF(args);
+        Py_XDECREF(kwargs);
+        if (!text)
+            return -1;
+
+        if (PyList_Append(Tokenizer_STACK(self), text)) {
+            Py_DECREF(text);
+            return -1;
+        }
+        Py_DECREF(text);
+
+        if (Tokenizer_set_textbuffer(self, PyList_New(0)))
+            return -1;
+    }
+    return 0;
+}
+
+/*
+    Pop the current stack/context/textbuffer, returning the stack.
+*/
+static PyObject*
+Tokenizer_pop(Tokenizer* self)
+{
+    if (Tokenizer_push_textbuffer(self))
+        return NULL;
+
+    /* Remove the topmost entry from self->stacks: */
+    PyObject* top = PyObject_CallMethod(self->stacks, "pop", NULL);
+    if (!top)
+        return NULL;
+    PyObject* stack = PyList_GET_ITEM(top, 0);  /* the token list itself */
+    Py_INCREF(stack);
+    Py_DECREF(self->topstack);  /* drop both references to the old top... */
+    Py_DECREF(top);
+
+    /* ...and make the entry below it (if any) current: */
+    Py_ssize_t size = PyList_GET_SIZE(self->stacks);
+    self->topstack = size ? PyList_GET_ITEM(self->stacks, size - 1) : NULL;
+    Py_XINCREF(self->topstack);
+    return stack;
+}
+
+/*
+    Pop the current stack/context/textbuffer, returning the stack. We will also
+    replace the underlying stack's context with the current stack's.
+*/
+static PyObject*
+Tokenizer_pop_keeping_context(Tokenizer* self)
+{
+    /* Save the current context before popping: */
+    PyObject* context = Tokenizer_CONTEXT(self);
+    Py_INCREF(context);
+
+    PyObject* stack = Tokenizer_pop(self);
+    if (!stack) {
+        Py_DECREF(context);
+        return NULL;
+    }
+    if (PyList_SetItem(self->topstack, 1, context))  /* steals context */
+        return NULL;
+    return stack;
+}
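
An aside on the stack discipline implemented by Tokenizer_push(),
Tokenizer_push_textbuffer(), and the two pop functions above: every entry on
self->stacks is a three-item list of [token stack, context, textbuffer], and
self->topstack always aliases the most recent entry. The same logic in rough
pure-Python form (an illustrative sketch with hypothetical names, not the
package's actual Python tokenizer):

    class StackManager(object):
        """Illustrates the C tokenizer's stack handling."""

        def __init__(self):
            self._stacks = []       # each entry: [token stack, context, textbuffer]
            self._topstack = None   # alias of self._stacks[-1]

        def push(self, context):
            # Tokenizer_push(): open a fresh stack/context/textbuffer.
            self._topstack = [[], context, []]
            self._stacks.append(self._topstack)

        def push_textbuffer(self):
            # Tokenizer_push_textbuffer(): flush buffered characters as one
            # text node, then reset the buffer.
            if self._topstack[2]:
                self._topstack[0].append(u"".join(self._topstack[2]))
                self._topstack[2] = []

        def pop(self):
            # Tokenizer_pop(): return the finished token stack and make the
            # entry below it (if any) current again.
            self.push_textbuffer()
            top = self._stacks.pop()
            self._topstack = self._stacks[-1] if self._stacks else None
            return top[0]

Nested constructs (say, a template inside a link) each get their own entry, so
partially built token lists never interleave; pop_keeping_context additionally
copies the popped entry's context down to the new topmost entry.
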
+
+/*
+    Read the value at a relative point in the wikicode.
+*/
+static const Py_UNICODE*
+Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
+{
+    Py_ssize_t index = self->head + delta;
+
+    if (index >= self->length) {
+        return OUT_OF_BOUNDS;
+    }
+
+    PyObject* item = PySequence_Fast_GET_ITEM(self->text, index);
+    return PyUnicode_AS_UNICODE(item);
+}
+
+/*
+    Parse the wikicode string, using *context* for when to stop.
+*/
+static PyObject*
+Tokenizer_parse(Tokenizer* self, int context)
+{
+    const Py_UNICODE* this;
+    const Py_UNICODE* marker;
+
+    if (Tokenizer_push(self, context))
+        return NULL;
+
+    while (1) {
+        this = Tokenizer_read(self, 0);
+        if (this == OUT_OF_BOUNDS) {
+            return Tokenizer_pop(self);
+        }
+
+        int is_marker = 0;
+        for (marker = MARKERS; *marker; marker++) {
+            if (*this == *marker) {
+                is_marker = 1;
+                break;
+            }
+        }
+
+        if (!is_marker) {
+            /* An ordinary character; buffer it as text: */
+            PyObject* ch = PyUnicode_FromUnicode(this, 1);
+            if (!ch || PyList_Append(Tokenizer_TEXTBUFFER(self), ch)) {
+                Py_XDECREF(ch);
+                return NULL;
+            }
+            Py_DECREF(ch);
+        }
+        /* Marker handling (templates, links, comments, etc.) is not
+           implemented yet; markers are skipped for now. */
+        self->head++;
+    }
+}
+
+/*
+    Build a list of tokens from a string of wikicode and return it.
+*/
+static PyObject*
+Tokenizer_tokenize(Tokenizer* self, PyObject* args)
+{
+    PyObject* text;
+
+    if (!PyArg_ParseTuple(args, "U", &text)) {
+        /* Failed to parse a Unicode object; try a string instead. */
+        PyErr_Clear();
+        const char* encoded;
+        Py_ssize_t size;
+
+        if (!PyArg_ParseTuple(args, "s#", &encoded, &size)) {
+            return NULL;
+        }
+
+        PyObject* temp;
+        temp = PyUnicode_FromStringAndSize(encoded, size);
+        if (temp == NULL)
+            return NULL;
+
+        Py_XDECREF(self->text);
+        text = PySequence_Fast(temp, "expected a sequence");
+        Py_XDECREF(temp);
+        self->text = text;
+    }
+    else {
+        Py_XDECREF(self->text);
+        self->text = PySequence_Fast(text, "expected a sequence");
+    }
+
+    if (self->text == NULL)
+        return NULL;
+
+    self->length = PySequence_Length(self->text);
+
+    return Tokenizer_parse(self, 0);
+}
+
+static PyMethodDef
+Tokenizer_methods[] = {
+    {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
+     "Build a list of tokens from a string of wikicode and return it."},
+    {NULL}
+};
+
+static PyMemberDef
+Tokenizer_members[] = {
+    {NULL}
+};
+
+static PyTypeObject
+TokenizerType = {
+    PyObject_HEAD_INIT(NULL)
+    0,                                      /* ob_size */
+    "_tokenizer.CTokenizer",                /* tp_name */
+    sizeof(Tokenizer),                      /* tp_basicsize */
+    0,                                      /* tp_itemsize */
+    (destructor) Tokenizer_dealloc,         /* tp_dealloc */
+    0,                                      /* tp_print */
+    0,                                      /* tp_getattr */
+    0,                                      /* tp_setattr */
+    0,                                      /* tp_compare */
+    0,                                      /* tp_repr */
+    0,                                      /* tp_as_number */
+    0,                                      /* tp_as_sequence */
+    0,                                      /* tp_as_mapping */
+    0,                                      /* tp_hash */
+    0,                                      /* tp_call */
+    0,                                      /* tp_str */
+    0,                                      /* tp_getattro */
+    0,                                      /* tp_setattro */
+    0,                                      /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,                     /* tp_flags */
+    "Creates a list of tokens from a string of wikicode.",  /* tp_doc */
+    0,                                      /* tp_traverse */
+    0,                                      /* tp_clear */
+    0,                                      /* tp_richcompare */
+    0,                                      /* tp_weaklistoffset */
+    0,                                      /* tp_iter */
+    0,                                      /* tp_iternext */
+    Tokenizer_methods,                      /* tp_methods */
+    Tokenizer_members,                      /* tp_members */
+    0,                                      /* tp_getset */
+    0,                                      /* tp_base */
+    0,                                      /* tp_dict */
+    0,                                      /* tp_descr_get */
+    0,                                      /* tp_descr_set */
+    0,                                      /* tp_dictoffset */
+    (initproc) Tokenizer_init,              /* tp_init */
+    0,                                      /* tp_alloc */
+    Tokenizer_new,                          /* tp_new */
+};
+
+PyMODINIT_FUNC
+init_tokenizer(void)
+{
+    PyObject* module;
+
+    if (PyType_Ready(&TokenizerType) < 0)
+        return;
+
+    module = Py_InitModule("_tokenizer", module_methods);
+    if (module == NULL)
+        return;
+
+    Py_INCREF(&TokenizerType);
+    PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
+
+    /* Load tokens.Text for Tokenizer_push_textbuffer(): */
+    PyObject* tokmod = PyImport_ImportModule("mwparserfromhell.parser.tokens");
+    if (tokmod == NULL)
+        return;
+    TextToken = PyObject_GetAttrString(tokmod, "Text");
+    Py_DECREF(tokmod);
+}
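
Once the extension compiles, usage from Python should look roughly like this
(a sketch: it assumes the tokens.Text wrapping in Tokenizer_push_textbuffer
above, and marker handling is still unimplemented, so markers are silently
skipped):

    from mwparserfromhell.parser._tokenizer import CTokenizer

    tokenizer = CTokenizer()
    tokens = tokenizer.tokenize(u"Hello, world!")
    # The input contains no markers, so everything is buffered and then
    # flushed by Tokenizer_pop() as a single Text token:
    # [Text(text=u'Hello, world!')]
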
diff --git a/setup.py b/setup.py
index 9faa56c..3664626 100644
--- a/setup.py
+++ b/setup.py
@@ -21,16 +21,23 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from setuptools import setup, find_packages
+from setuptools import setup, find_packages, Extension
 
 from mwparserfromhell import __version__
 
 with open("README.rst") as fp:
     long_docs = fp.read()
 
+builder = Extension("mwparserfromhell.parser._builder",
+                    sources = ["mwparserfromhell/parser/builder.c"])
+
+tokenizer = Extension("mwparserfromhell.parser._tokenizer",
+                      sources = ["mwparserfromhell/parser/tokenizer.c"])
+
 setup(
     name = "mwparserfromhell",
     packages = find_packages(exclude=("tests",)),
+    ext_modules = [builder, tokenizer],
     test_suite = "tests",
     version = __version__,
     author = "Ben Kurtovic",
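
With ext_modules declared, python setup.py build (or build_ext --inplace
during development) compiles both extensions, which requires a C compiler and
the Python development headers. Since the C modules are accelerators for
existing pure-Python code, the parser package will probably want a guarded
import so installations without a compiler keep working; a sketch of that
pattern (not part of this diff):

    try:
        from mwparserfromhell.parser._tokenizer import CTokenizer
    except ImportError:
        CTokenizer = None  # fall back to the pure-Python tokenizer
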