From f67cf46900aebf3bc07c8fb2814ec06c9701e05b Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 10 Jul 2013 03:58:47 -0400
Subject: [PATCH] Start C port of tag tokenization; refactor the init func.

---
 mwparserfromhell/parser/tokenizer.c | 101 +++++++++++++++++++++++++++---------
 mwparserfromhell/parser/tokenizer.h |  78 ++++++++++++++++------------
 2 files changed, 121 insertions(+), 58 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 99f8c9c..e575d2e 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -35,6 +35,22 @@ static int heading_level_from_context(int n)
     return level;
 }
 
+/*
+    Call the given function in tag_defs, using 'tag' as a parameter, and return
+    its output as a bool.
+*/
+static int
+call_tag_def_func(const char* funcname, PyObject* tag)
+{
+    PyObject* func = PyObject_GetAttrString(tag_defs, funcname);
+    PyObject* result = PyObject_CallFunctionObjArgs(func, tag, NULL);
+    int ans = (result == Py_True) ? 1 : 0;
+
+    Py_DECREF(func);
+    Py_DECREF(result);
+    return ans;
+}
+
 static PyObject*
 Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
 {
@@ -1418,22 +1434,11 @@ Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     return Tokenizer_parse(self, 0);
 }
 
-PyMODINIT_FUNC
-init_tokenizer(void)
+static void
+load_entitydefs(void)
 {
-    PyObject *module, *tempmod, *defmap, *deflist, *globals, *locals,
-             *fromlist, *modname;
+    PyObject *tempmod, *defmap, *deflist;
     unsigned numdefs, i;
-    char *name;
-
-    TokenizerType.tp_new = PyType_GenericNew;
-    if (PyType_Ready(&TokenizerType) < 0)
-        return;
-    module = Py_InitModule("_tokenizer", module_methods);
-    Py_INCREF(&TokenizerType);
-    PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
-    Py_INCREF(Py_True);
-    PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
 
     tempmod = PyImport_ImportModule("htmlentitydefs");
     if (!tempmod)
@@ -1451,18 +1456,19 @@
     for (i = 0; i < numdefs; i++)
         entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i));
     Py_DECREF(deflist);
+}
 
-    EMPTY = PyUnicode_FromString("");
-    NOARGS = PyTuple_New(0);
-
-    name = "mwparserfromhell.parser";
-    globals = PyEval_GetGlobals();
-    locals = PyEval_GetLocals();
-    fromlist = PyList_New(1);
-    if (!fromlist)
-        return;
-    modname = PyBytes_FromString("tokens");
-    if (!modname)
+static void
+load_tokens(void)
+{
+    PyObject *tempmod, *tokens,
+             *globals = PyEval_GetGlobals(),
+             *locals = PyEval_GetLocals(),
+             *fromlist = PyList_New(1),
+             *modname = PyBytes_FromString("tokens");
+    char *name = "mwparserfromhell.parser";
+
+    if (!fromlist || !modname)
         return;
     PyList_SET_ITEM(fromlist, 0, modname);
     tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
@@ -1508,4 +1514,49 @@
     TagCloseSelfclose = PyObject_GetAttrString(tokens, "TagCloseSelfclose");
     TagOpenClose = PyObject_GetAttrString(tokens, "TagOpenClose");
     TagCloseClose = PyObject_GetAttrString(tokens, "TagCloseClose");
+
+    Py_DECREF(tokens);
+}
+
+static void
+load_tag_defs(void)
+{
+    PyObject *tempmod,
+             *globals = PyEval_GetGlobals(),
+             *locals = PyEval_GetLocals(),
+             *fromlist = PyList_New(1),
+             *modname = PyBytes_FromString("tag_defs");
+    char *name = "mwparserfromhell";
+
+    if (!fromlist || !modname)
+        return;
+    PyList_SET_ITEM(fromlist, 0, modname);
+    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
+    Py_DECREF(fromlist);
+    if (!tempmod)
+        return;
+    tag_defs = PyObject_GetAttrString(tempmod, "tag_defs");
+    Py_DECREF(tempmod);
+}
+
+PyMODINIT_FUNC
+init_tokenizer(void)
+{
+    PyObject *module;
+
+    TokenizerType.tp_new = PyType_GenericNew;
+    if (PyType_Ready(&TokenizerType) < 0)
+        return;
+    module = Py_InitModule("_tokenizer", module_methods);
+    Py_INCREF(&TokenizerType);
+    PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
+    Py_INCREF(Py_True);
+    PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
+
+    EMPTY = PyUnicode_FromString("");
+    NOARGS = PyTuple_New(0);
+
+    load_entitydefs();
+    load_tokens();
+    load_tag_defs();
 }
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 1f58c49..c81c0bf 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -60,10 +60,10 @@ static char** entitydefs;
 
 static PyObject* EMPTY;
 static PyObject* NOARGS;
-static PyObject* tokens;
+static PyObject* tag_defs;
 
 
-/* Tokens */
+/* Tokens: */
 
 static PyObject* Text;
 
@@ -102,36 +102,42 @@ static PyObject* TagCloseClose;
 
 /* Local contexts: */
 
-#define LC_TEMPLATE             0x00007
-#define LC_TEMPLATE_NAME        0x00001
-#define LC_TEMPLATE_PARAM_KEY   0x00002
-#define LC_TEMPLATE_PARAM_VALUE 0x00004
-
-#define LC_ARGUMENT         0x00018
-#define LC_ARGUMENT_NAME    0x00008
-#define LC_ARGUMENT_DEFAULT 0x00010
-
-#define LC_WIKILINK       0x00060
-#define LC_WIKILINK_TITLE 0x00020
-#define LC_WIKILINK_TEXT  0x00040
-
-#define LC_HEADING         0x01F80
-#define LC_HEADING_LEVEL_1 0x00080
-#define LC_HEADING_LEVEL_2 0x00100
-#define LC_HEADING_LEVEL_3 0x00200
-#define LC_HEADING_LEVEL_4 0x00400
-#define LC_HEADING_LEVEL_5 0x00800
-#define LC_HEADING_LEVEL_6 0x01000
-
-#define LC_COMMENT 0x02000
-
-#define LC_SAFETY_CHECK    0xFC000
-#define LC_HAS_TEXT        0x04000
-#define LC_FAIL_ON_TEXT    0x08000
-#define LC_FAIL_NEXT       0x10000
-#define LC_FAIL_ON_LBRACE  0x20000
-#define LC_FAIL_ON_RBRACE  0x40000
-#define LC_FAIL_ON_EQUALS  0x80000
+#define LC_TEMPLATE             0x000007
+#define LC_TEMPLATE_NAME        0x000001
+#define LC_TEMPLATE_PARAM_KEY   0x000002
+#define LC_TEMPLATE_PARAM_VALUE 0x000004
+
+#define LC_ARGUMENT         0x000018
+#define LC_ARGUMENT_NAME    0x000008
+#define LC_ARGUMENT_DEFAULT 0x000010
+
+#define LC_WIKILINK       0x000060
+#define LC_WIKILINK_TITLE 0x000020
+#define LC_WIKILINK_TEXT  0x000040
+
+#define LC_HEADING         0x001F80
+#define LC_HEADING_LEVEL_1 0x000080
+#define LC_HEADING_LEVEL_2 0x000100
+#define LC_HEADING_LEVEL_3 0x000200
+#define LC_HEADING_LEVEL_4 0x000400
+#define LC_HEADING_LEVEL_5 0x000800
+#define LC_HEADING_LEVEL_6 0x001000
+
+#define LC_COMMENT 0x002000
+
+#define LC_TAG       0x03C000
+#define LC_TAG_OPEN  0x004000
+#define LC_TAG_ATTR  0x008000
+#define LC_TAG_BODY  0x010000
+#define LC_TAG_CLOSE 0x020000
+
+#define LC_SAFETY_CHECK    0xFC0000
+#define LC_HAS_TEXT        0x040000
+#define LC_FAIL_ON_TEXT    0x080000
+#define LC_FAIL_NEXT       0x100000
+#define LC_FAIL_ON_LBRACE  0x200000
+#define LC_FAIL_ON_RBRACE  0x400000
+#define LC_FAIL_ON_EQUALS  0x800000
 
 
 /* Global contexts: */
@@ -179,9 +185,15 @@ typedef struct {
 #define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES)
 
 
+/* Macros for accessing HTML tag definitions: */
+
+#define IS_PARSABLE(tag)    (call_tag_def_func("is_parsable", tag))
+#define IS_SINGLE(tag)      (call_tag_def_func("is_single", tag))
+#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag))
+
+
 /* Function prototypes: */
 
-static int heading_level_from_context(int);
 static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
 static struct Textbuffer* Textbuffer_new(void);
 static void Tokenizer_dealloc(Tokenizer*);
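
A note on the tag_defs bridge introduced above: IS_PARSABLE, IS_SINGLE, and IS_SINGLE_ONLY all expand to call_tag_def_func(), which calls back into the Python-level tag_defs module loaded by load_tag_defs(). The version in this patch assumes the attribute lookup and the call both succeed, and that the Python function returns exactly Py_True. The sketch below is a hypothetical, slightly more defensive variant and is not part of this commit; the name call_tag_def_func_checked and the caller named in the usage comment are illustrative only. It assumes it lives in tokenizer.c next to the original, so Python.h and the static tag_defs pointer are already in scope.

/*
    Hypothetical hardened variant of call_tag_def_func() -- not part of this
    patch. Assumes the static 'tag_defs' module object set up by
    load_tag_defs() in tokenizer.c.
*/
static int
call_tag_def_func_checked(const char* funcname, PyObject* tag)
{
    PyObject *func, *result;
    int ans;

    func = PyObject_GetAttrString(tag_defs, funcname);
    if (!func) {
        PyErr_Clear();                    /* missing attribute: treat as false */
        return 0;
    }
    result = PyObject_CallFunctionObjArgs(func, tag, NULL);
    Py_DECREF(func);
    if (!result) {
        PyErr_Clear();                    /* call raised: treat as false */
        return 0;
    }
    ans = PyObject_IsTrue(result);        /* any truthy value counts, not only Py_True */
    Py_DECREF(result);
    if (ans == -1) {
        PyErr_Clear();
        return 0;
    }
    return ans;
}

/*
    Intended usage mirrors the macros added to tokenizer.h, e.g.:

        if (IS_SINGLE_ONLY(tag_name))
            return Tokenizer_handle_single_only_tag(self);   // hypothetical caller
*/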