Browse Source

Begin porting C tokenizer to Python 3.

tags/v0.3
Ben Kurtovic 10 years ago
parent
commit
25d53cacf8
3 changed files with 65 additions and 37 deletions
  1. +50
    -20
      mwparserfromhell/parser/tokenizer.c
  2. +14
    -13
      mwparserfromhell/parser/tokenizer.h
  3. +1
    -4
      setup.py

+ 50
- 20
mwparserfromhell/parser/tokenizer.c View File

@@ -207,7 +207,7 @@ static void Tokenizer_dealloc(Tokenizer* self)
free(this); free(this);
this = next; this = next;
} }
self->ob_type->tp_free((PyObject*) self);
Py_TYPE(self)->tp_free((PyObject*) self);
} }


static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
@@ -835,7 +835,11 @@ static int Tokenizer_parse_heading(Tokenizer* self)
self->global ^= GL_HEADING; self->global ^= GL_HEADING;
return 0; return 0;
} }
#ifdef IS_PY3K
level = PyLong_FromSsize_t(heading->level);
#else
level = PyInt_FromSsize_t(heading->level); level = PyInt_FromSsize_t(heading->level);
#endif
if (!level) { if (!level) {
Py_DECREF(heading->title); Py_DECREF(heading->title);
free(heading); free(heading);
@@ -2299,30 +2303,40 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
return Tokenizer_parse(self, 0, 1); return Tokenizer_parse(self, 0, 1);
} }


static void load_entitydefs(void)
static int load_entitydefs(void)
{ {
PyObject *tempmod, *defmap, *deflist; PyObject *tempmod, *defmap, *deflist;
unsigned numdefs, i; unsigned numdefs, i;


#ifdef IS_PY3K
tempmod = PyImport_ImportModule("html.entities");
#else
tempmod = PyImport_ImportModule("htmlentitydefs"); tempmod = PyImport_ImportModule("htmlentitydefs");
#endif
if (!tempmod) if (!tempmod)
return;
return -1;
defmap = PyObject_GetAttrString(tempmod, "entitydefs"); defmap = PyObject_GetAttrString(tempmod, "entitydefs");
if (!defmap) if (!defmap)
return;
return -1;
Py_DECREF(tempmod); Py_DECREF(tempmod);
deflist = PyDict_Keys(defmap); deflist = PyDict_Keys(defmap);
if (!deflist) if (!deflist)
return;
return -1;
Py_DECREF(defmap); Py_DECREF(defmap);
numdefs = (unsigned) PyList_GET_SIZE(defmap); numdefs = (unsigned) PyList_GET_SIZE(defmap);
entitydefs = calloc(numdefs + 1, sizeof(char*)); entitydefs = calloc(numdefs + 1, sizeof(char*));
for (i = 0; i < numdefs; i++)
if (!entitydefs)
return -1;
for (i = 0; i < numdefs; i++) {
entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i)); entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i));
if (!entitydefs[i])
return -1;
}
Py_DECREF(deflist); Py_DECREF(deflist);
return 0;
} }


static void load_tokens(void)
static int load_tokens(void)
{ {
PyObject *tempmod, *tokens, PyObject *tempmod, *tokens,
*globals = PyEval_GetGlobals(), *globals = PyEval_GetGlobals(),
@@ -2332,12 +2346,12 @@ static void load_tokens(void)
char *name = "mwparserfromhell.parser"; char *name = "mwparserfromhell.parser";


if (!fromlist || !modname) if (!fromlist || !modname)
return;
return -1;
PyList_SET_ITEM(fromlist, 0, modname); PyList_SET_ITEM(fromlist, 0, modname);
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0); tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
Py_DECREF(fromlist); Py_DECREF(fromlist);
if (!tempmod) if (!tempmod)
return;
return -1;
tokens = PyObject_GetAttrString(tempmod, "tokens"); tokens = PyObject_GetAttrString(tempmod, "tokens");
Py_DECREF(tempmod); Py_DECREF(tempmod);


@@ -2379,9 +2393,10 @@ static void load_tokens(void)
TagCloseClose = PyObject_GetAttrString(tokens, "TagCloseClose"); TagCloseClose = PyObject_GetAttrString(tokens, "TagCloseClose");


Py_DECREF(tokens); Py_DECREF(tokens);
return 0;
} }


static void load_tag_defs(void)
static int load_tag_defs(void)
{ {
PyObject *tempmod, PyObject *tempmod,
*globals = PyEval_GetGlobals(), *globals = PyEval_GetGlobals(),
@@ -2391,33 +2406,48 @@ static void load_tag_defs(void)
char *name = "mwparserfromhell"; char *name = "mwparserfromhell";


if (!fromlist || !modname) if (!fromlist || !modname)
return;
return -1;
PyList_SET_ITEM(fromlist, 0, modname); PyList_SET_ITEM(fromlist, 0, modname);
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0); tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
Py_DECREF(fromlist); Py_DECREF(fromlist);
if (!tempmod) if (!tempmod)
return;
return -1;
tag_defs = PyObject_GetAttrString(tempmod, "tag_defs"); tag_defs = PyObject_GetAttrString(tempmod, "tag_defs");
Py_DECREF(tempmod); Py_DECREF(tempmod);
return 0;
} }


PyMODINIT_FUNC init_tokenizer(void)
#ifdef IS_PY3K
#define INIT_ERROR return NULL
PyMODINIT_FUNC PyInit__tokenizer(void)
#else
#define INIT_ERROR return
PyMODINIT_FUNC init_tokenizer(void)
#endif
{ {
PyObject *module; PyObject *module;


TokenizerType.tp_new = PyType_GenericNew; TokenizerType.tp_new = PyType_GenericNew;
if (PyType_Ready(&TokenizerType) < 0) if (PyType_Ready(&TokenizerType) < 0)
return;
module = Py_InitModule("_tokenizer", module_methods);
INIT_ERROR;
#ifdef IS_PY3K
module = PyModule_Create(&module_def);
#else
module = Py_InitModule("_tokenizer", NULL);
#endif
if (!module)
INIT_ERROR;
Py_INCREF(&TokenizerType); Py_INCREF(&TokenizerType);
PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType); PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
Py_INCREF(Py_True); Py_INCREF(Py_True);
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True); PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);

EMPTY = PyUnicode_FromString(""); EMPTY = PyUnicode_FromString("");
NOARGS = PyTuple_New(0); NOARGS = PyTuple_New(0);

load_entitydefs();
load_tokens();
load_tag_defs();
if (!EMPTY || !NOARGS)
INIT_ERROR;
if (load_entitydefs() || load_tokens() || load_tag_defs())
INIT_ERROR;
#ifdef IS_PY3K
return module;
#endif
} }

+ 14
- 13
mwparserfromhell/parser/tokenizer.h View File

@@ -28,6 +28,7 @@ SOFTWARE.
#include <Python.h> #include <Python.h>
#include <math.h> #include <math.h>
#include <structmember.h> #include <structmember.h>
#include <bytesobject.h>


#if PY_MAJOR_VERSION >= 3 #if PY_MAJOR_VERSION >= 3
#define IS_PY3K #define IS_PY3K
@@ -253,27 +254,18 @@ static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);


/* More structs for creating the Tokenizer type: */ /* More structs for creating the Tokenizer type: */


static PyMethodDef
Tokenizer_methods[] = {
static PyMethodDef Tokenizer_methods[] = {
{"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS, {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
"Build a list of tokens from a string of wikicode and return it."}, "Build a list of tokens from a string of wikicode and return it."},
{NULL} {NULL}
}; };


/* Member table for the Tokenizer type; it holds nothing but its sentinel. */
static PyMemberDef Tokenizer_members[] = {
    {NULL, 0, 0, 0, NULL}  /* sentinel */
};


static PyMethodDef
module_methods[] = {
{NULL}
};

static PyTypeObject
TokenizerType = {
PyObject_HEAD_INIT(NULL)
0, /* ob_size */
static PyTypeObject TokenizerType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_tokenizer.CTokenizer", /* tp_name */ "_tokenizer.CTokenizer", /* tp_name */
sizeof(Tokenizer), /* tp_basicsize */ sizeof(Tokenizer), /* tp_basicsize */
0, /* tp_itemsize */ 0, /* tp_itemsize */
@@ -312,3 +304,12 @@ TokenizerType = {
0, /* tp_alloc */ 0, /* tp_alloc */
Tokenizer_new, /* tp_new */ Tokenizer_new, /* tp_new */
}; };

#ifdef IS_PY3K
/* Module definition passed to PyModule_Create() on Python 3 builds. */
static PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,                                  /* m_base */
    "_tokenizer",                                           /* m_name */
    "Creates a list of tokens from a string of wikicode.",  /* m_doc */
    -1,                                                     /* m_size */
    NULL,                                                   /* m_methods */
    NULL,                                                   /* m_reload */
    NULL,                                                   /* m_traverse */
    NULL,                                                   /* m_clear */
    NULL                                                    /* m_free */
};
#endif

+ 1
- 4
setup.py View File

@@ -29,16 +29,13 @@ from mwparserfromhell.compat import py3k
with open("README.rst") as fp: with open("README.rst") as fp:
long_docs = fp.read() long_docs = fp.read()


# builder = Extension("mwparserfromhell.parser._builder",
# sources = ["mwparserfromhell/parser/builder.c"])

tokenizer = Extension("mwparserfromhell.parser._tokenizer", tokenizer = Extension("mwparserfromhell.parser._tokenizer",
sources = ["mwparserfromhell/parser/tokenizer.c"]) sources = ["mwparserfromhell/parser/tokenizer.c"])


setup( setup(
name = "mwparserfromhell", name = "mwparserfromhell",
packages = find_packages(exclude=("tests",)), packages = find_packages(exclude=("tests",)),
ext_modules = [] if py3k else [tokenizer],
ext_modules = [tokenizer],
test_suite = "tests", test_suite = "tests",
version = __version__, version = __version__,
author = "Ben Kurtovic", author = "Ben Kurtovic",


Loading…
Cancel
Save