From 25d53cacf8abc76a55cbf1af1b77b4cb9b6b0f5c Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 14 Aug 2013 23:54:06 -0400
Subject: [PATCH] Begin porting C tokenizer to Python 3.

---
 mwparserfromhell/parser/tokenizer.c | 70 ++++++++++++++++++++++++++-----------
 mwparserfromhell/parser/tokenizer.h | 27 +++++++-------
 setup.py                            |  5 +--
 3 files changed, 65 insertions(+), 37 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 4df61d8..60223e1 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -207,7 +207,7 @@ static void Tokenizer_dealloc(Tokenizer* self)
         free(this);
         this = next;
     }
-    self->ob_type->tp_free((PyObject*) self);
+    Py_TYPE(self)->tp_free((PyObject*) self);
 }
 
 static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
@@ -835,7 +835,11 @@ static int Tokenizer_parse_heading(Tokenizer* self)
         self->global ^= GL_HEADING;
         return 0;
     }
+#ifdef IS_PY3K
+    level = PyLong_FromSsize_t(heading->level);
+#else
     level = PyInt_FromSsize_t(heading->level);
+#endif
     if (!level) {
         Py_DECREF(heading->title);
         free(heading);
@@ -2299,30 +2303,40 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     return Tokenizer_parse(self, 0, 1);
 }
 
-static void load_entitydefs(void)
+static int load_entitydefs(void)
 {
     PyObject *tempmod, *defmap, *deflist;
     unsigned numdefs, i;
 
+#ifdef IS_PY3K
+    tempmod = PyImport_ImportModule("html.entities");
+#else
     tempmod = PyImport_ImportModule("htmlentitydefs");
+#endif
     if (!tempmod)
-        return;
+        return -1;
     defmap = PyObject_GetAttrString(tempmod, "entitydefs");
     if (!defmap)
-        return;
+        return -1;
     Py_DECREF(tempmod);
     deflist = PyDict_Keys(defmap);
     if (!deflist)
-        return;
+        return -1;
     Py_DECREF(defmap);
     numdefs = (unsigned) PyList_GET_SIZE(deflist);
     entitydefs = calloc(numdefs + 1, sizeof(char*));
-    for (i = 0; i < numdefs; i++)
+    if (!entitydefs)
+        return -1;
+    for (i = 0; i < numdefs; i++) {
         entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i));
+        if (!entitydefs[i])
+            return -1;
+    }
     Py_DECREF(deflist);
+    return 0;
 }
 
-static void load_tokens(void)
+static int load_tokens(void)
 {
     PyObject *tempmod, *tokens,
              *globals = PyEval_GetGlobals(),
@@ -2332,12 +2346,12 @@
     char *name = "mwparserfromhell.parser";
 
     if (!fromlist || !modname)
-        return;
+        return -1;
     PyList_SET_ITEM(fromlist, 0, modname);
     tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
     Py_DECREF(fromlist);
     if (!tempmod)
-        return;
+        return -1;
     tokens = PyObject_GetAttrString(tempmod, "tokens");
     Py_DECREF(tempmod);
 
@@ -2379,9 +2393,10 @@
     TagCloseClose = PyObject_GetAttrString(tokens, "TagCloseClose");
 
     Py_DECREF(tokens);
+    return 0;
 }
 
-static void load_tag_defs(void)
+static int load_tag_defs(void)
 {
     PyObject *tempmod,
              *globals = PyEval_GetGlobals(),
@@ -2391,33 +2406,48 @@
     char *name = "mwparserfromhell";
 
     if (!fromlist || !modname)
-        return;
+        return -1;
     PyList_SET_ITEM(fromlist, 0, modname);
     tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
     Py_DECREF(fromlist);
     if (!tempmod)
-        return;
+        return -1;
     tag_defs = PyObject_GetAttrString(tempmod, "tag_defs");
     Py_DECREF(tempmod);
+    return 0;
 }
 
-PyMODINIT_FUNC init_tokenizer(void)
+#ifdef IS_PY3K
+    #define INIT_ERROR return NULL
+    PyMODINIT_FUNC PyInit__tokenizer(void)
+#else
+    #define INIT_ERROR return
+    PyMODINIT_FUNC init_tokenizer(void)
+#endif
 {
     PyObject *module;
 
     TokenizerType.tp_new = PyType_GenericNew;
     if (PyType_Ready(&TokenizerType) < 0)
-        return;
-    module = Py_InitModule("_tokenizer", module_methods);
+        INIT_ERROR;
+#ifdef IS_PY3K
+    module = PyModule_Create(&module_def);
+#else
+    module = Py_InitModule("_tokenizer", NULL);
+#endif
+    if (!module)
+        INIT_ERROR;
     Py_INCREF(&TokenizerType);
     PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
     Py_INCREF(Py_True);
     PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
-
     EMPTY = PyUnicode_FromString("");
     NOARGS = PyTuple_New(0);
-
-    load_entitydefs();
-    load_tokens();
-    load_tag_defs();
+    if (!EMPTY || !NOARGS)
+        INIT_ERROR;
+    if (load_entitydefs() || load_tokens() || load_tag_defs())
+        INIT_ERROR;
+#ifdef IS_PY3K
+    return module;
+#endif
 }
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index d5f755d..2bf6973 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -28,6 +28,7 @@ SOFTWARE.
 #include <Python.h>
 #include <math.h>
 #include <structmember.h>
+#include <bytesobject.h>
 
 #if PY_MAJOR_VERSION >= 3
 #define IS_PY3K
@@ -253,27 +254,18 @@ static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
 
 /* More structs for creating the Tokenizer type: */
 
-static PyMethodDef
-Tokenizer_methods[] = {
+static PyMethodDef Tokenizer_methods[] = {
     {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
     "Build a list of tokens from a string of wikicode and return it."},
     {NULL}
 };
 
-static PyMemberDef
-Tokenizer_members[] = {
+static PyMemberDef Tokenizer_members[] = {
     {NULL}
 };
 
-static PyMethodDef
-module_methods[] = {
-    {NULL}
-};
-
-static PyTypeObject
-TokenizerType = {
-    PyObject_HEAD_INIT(NULL)
-    0,                          /* ob_size */
+static PyTypeObject TokenizerType = {
+    PyVarObject_HEAD_INIT(NULL, 0)
     "_tokenizer.CTokenizer",    /* tp_name */
     sizeof(Tokenizer),          /* tp_basicsize */
     0,                          /* tp_itemsize */
@@ -312,3 +304,12 @@ TokenizerType = {
     0,                          /* tp_alloc */
     Tokenizer_new,              /* tp_new */
 };
+
+#ifdef IS_PY3K
+static PyModuleDef module_def = {
+    PyModuleDef_HEAD_INIT,
+    "_tokenizer",
+    "Creates a list of tokens from a string of wikicode.",
+    -1, NULL, NULL, NULL, NULL, NULL
+};
+#endif
diff --git a/setup.py b/setup.py
index 8b4ae86..5e6d779 100644
--- a/setup.py
+++ b/setup.py
@@ -29,16 +29,13 @@ from mwparserfromhell.compat import py3k
 with open("README.rst") as fp:
     long_docs = fp.read()
 
-# builder = Extension("mwparserfromhell.parser._builder",
-#                     sources = ["mwparserfromhell/parser/builder.c"])
-
 tokenizer = Extension("mwparserfromhell.parser._tokenizer",
                       sources = ["mwparserfromhell/parser/tokenizer.c"])
 
 setup(
     name = "mwparserfromhell",
     packages = find_packages(exclude=("tests",)),
-    ext_modules = [] if py3k else [tokenizer],
+    ext_modules = [tokenizer],
     test_suite = "tests",
     version = __version__,
     author = "Ben Kurtovic",
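
A note on the module-initialization hunk above: Python 3 removed Py_InitModule() in favor of PyModule_Create() driven by a PyModuleDef struct, renamed the entry point from init<name> to PyInit_<name>, and changed its return type from void to PyObject*, which is why the patch introduces the INIT_ERROR macro. The pattern is easier to see in isolation; below is a minimal, self-contained sketch of the same technique. The module name "_example" and its docstring are hypothetical, for illustration only; they are not part of this patch.

    #include <Python.h>

    #if PY_MAJOR_VERSION >= 3
    #define IS_PY3K
    #endif

    #ifdef IS_PY3K
    /* Python 3: the module is described by a PyModuleDef and created
       with PyModule_Create(); the init function returns the module,
       or NULL on failure. */
    static PyModuleDef example_def = {
        PyModuleDef_HEAD_INIT,
        "_example",                           /* m_name (hypothetical) */
        "Minimal dual-version init example.", /* m_doc */
        -1,                                   /* m_size: no module state */
        NULL, NULL, NULL, NULL, NULL
    };
    #define INIT_ERROR return NULL
    PyMODINIT_FUNC PyInit__example(void)
    #else
    /* Python 2: Py_InitModule() creates the module, and the init
       function returns void, so errors are reported by a bare return. */
    #define INIT_ERROR return
    PyMODINIT_FUNC init_example(void)
    #endif
    {
        PyObject* module;

    #ifdef IS_PY3K
        module = PyModule_Create(&example_def);
    #else
        module = Py_InitModule("_example", NULL);  /* NULL: no methods */
    #endif
        if (!module)
            INIT_ERROR;
        /* ... add types and constants to the module here ... */
    #ifdef IS_PY3K
        return module;  /* the Python 2 init function returns void */
    #endif
    }

Wrapping both variants behind a single INIT_ERROR macro keeps one body for the init function instead of duplicating it per Python version; the same trick carries the void/PyObject* return-type difference through every error path.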