
Start C port of tag tokenization; refactor the init func.

Ben Kurtovic committed 11 years ago · commit f67cf46900 · tags/v0.3
2 changed files with 121 additions and 58 deletions:
  1. mwparserfromhell/parser/tokenizer.c  (+76, −25)
  2. mwparserfromhell/parser/tokenizer.h  (+45, −33)

mwparserfromhell/parser/tokenizer.c  (+76, −25)

@@ -35,6 +35,22 @@ static int heading_level_from_context(int n)
return level;
}

/*
Call the given function in tag_defs, using 'tag' as a parameter, and return
its output as a bool.
*/
static int
call_tag_def_func(const char* funcname, PyObject* tag)
{
PyObject* func = PyObject_GetAttrString(tag_defs, funcname);
PyObject* result = PyObject_CallFunctionObjArgs(func, tag, NULL);
int ans = (result == Py_True) ? 1 : 0;

Py_DECREF(func);
Py_DECREF(result);
return ans;
}

static PyObject*
Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
{
@@ -1418,22 +1434,11 @@ Tokenizer_tokenize(Tokenizer* self, PyObject* args)
return Tokenizer_parse(self, 0);
}

PyMODINIT_FUNC
init_tokenizer(void)
static void
load_entitydefs(void)
{
PyObject *module, *tempmod, *defmap, *deflist, *globals, *locals,
*fromlist, *modname;
PyObject *tempmod, *defmap, *deflist;
unsigned numdefs, i;
char *name;

TokenizerType.tp_new = PyType_GenericNew;
if (PyType_Ready(&TokenizerType) < 0)
return;
module = Py_InitModule("_tokenizer", module_methods);
Py_INCREF(&TokenizerType);
PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
Py_INCREF(Py_True);
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);

tempmod = PyImport_ImportModule("htmlentitydefs");
if (!tempmod)
@@ -1451,18 +1456,19 @@ init_tokenizer(void)
for (i = 0; i < numdefs; i++)
entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i));
Py_DECREF(deflist);
}

EMPTY = PyUnicode_FromString("");
NOARGS = PyTuple_New(0);
name = "mwparserfromhell.parser";
globals = PyEval_GetGlobals();
locals = PyEval_GetLocals();
fromlist = PyList_New(1);
if (!fromlist)
return;
modname = PyBytes_FromString("tokens");
if (!modname)
static void
load_tokens(void)
{
PyObject *tempmod, *tokens,
*globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(),
*fromlist = PyList_New(1),
*modname = PyBytes_FromString("tokens");
char *name = "mwparserfromhell.parser";
if (!fromlist || !modname)
return;
PyList_SET_ITEM(fromlist, 0, modname);
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
@@ -1508,4 +1514,49 @@ init_tokenizer(void)
TagCloseSelfclose = PyObject_GetAttrString(tokens, "TagCloseSelfclose");
TagOpenClose = PyObject_GetAttrString(tokens, "TagOpenClose");
TagCloseClose = PyObject_GetAttrString(tokens, "TagCloseClose");

Py_DECREF(tokens);
}

static void
load_tag_defs(void)
{
PyObject *tempmod,
*globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(),
*fromlist = PyList_New(1),
*modname = PyBytes_FromString("tag_defs");
char *name = "mwparserfromhell";

if (!fromlist || !modname)
return;
PyList_SET_ITEM(fromlist, 0, modname);
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
Py_DECREF(fromlist);
if (!tempmod)
return;
tag_defs = PyObject_GetAttrString(tempmod, "tag_defs");
Py_DECREF(tempmod);
}

PyMODINIT_FUNC
init_tokenizer(void)
{
PyObject *module;

TokenizerType.tp_new = PyType_GenericNew;
if (PyType_Ready(&TokenizerType) < 0)
return;
module = Py_InitModule("_tokenizer", module_methods);
Py_INCREF(&TokenizerType);
PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
Py_INCREF(Py_True);
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);

EMPTY = PyUnicode_FromString("");
NOARGS = PyTuple_New(0);

load_entitydefs();
load_tokens();
load_tag_defs();
}
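
The new call_tag_def_func helper does not check whether the attribute lookup on tag_defs or the call itself succeeds before decrementing the reference counts. A more defensive variant might look like the sketch below; it is not part of this commit, only an illustration that assumes the same tag_defs global declared in tokenizer.h and a caller willing to treat -1 as an error:

    /* Hypothetical defensive variant of call_tag_def_func (illustration only,
       not part of this commit): returns -1 if the lookup or the call fails. */
    static int
    call_tag_def_func_checked(const char* funcname, PyObject* tag)
    {
        PyObject* func = PyObject_GetAttrString(tag_defs, funcname);
        PyObject* result;
        int ans;

        if (!func)
            return -1;
        result = PyObject_CallFunctionObjArgs(func, tag, NULL);
        Py_DECREF(func);
        if (!result)
            return -1;
        ans = (result == Py_True) ? 1 : 0;
        Py_DECREF(result);
        return ans;
    }

Callers of such a variant would have to distinguish the error value from an ordinary false result, which the IS_* macros added in tokenizer.h do not currently do.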

mwparserfromhell/parser/tokenizer.h  (+45, −33)

@@ -60,10 +60,10 @@ static char** entitydefs;

static PyObject* EMPTY;
static PyObject* NOARGS;
static PyObject* tokens;
static PyObject* tag_defs;


/* Tokens */
/* Tokens: */

static PyObject* Text;

@@ -102,36 +102,42 @@ static PyObject* TagCloseClose;

/* Local contexts: */

#define LC_TEMPLATE 0x00007
#define LC_TEMPLATE_NAME 0x00001
#define LC_TEMPLATE_PARAM_KEY 0x00002
#define LC_TEMPLATE_PARAM_VALUE 0x00004

#define LC_ARGUMENT 0x00018
#define LC_ARGUMENT_NAME 0x00008
#define LC_ARGUMENT_DEFAULT 0x00010

#define LC_WIKILINK 0x00060
#define LC_WIKILINK_TITLE 0x00020
#define LC_WIKILINK_TEXT 0x00040

#define LC_HEADING 0x01F80
#define LC_HEADING_LEVEL_1 0x00080
#define LC_HEADING_LEVEL_2 0x00100
#define LC_HEADING_LEVEL_3 0x00200
#define LC_HEADING_LEVEL_4 0x00400
#define LC_HEADING_LEVEL_5 0x00800
#define LC_HEADING_LEVEL_6 0x01000

#define LC_COMMENT 0x02000

#define LC_SAFETY_CHECK 0xFC000
#define LC_HAS_TEXT 0x04000
#define LC_FAIL_ON_TEXT 0x08000
#define LC_FAIL_NEXT 0x10000
#define LC_FAIL_ON_LBRACE 0x20000
#define LC_FAIL_ON_RBRACE 0x40000
#define LC_FAIL_ON_EQUALS 0x80000
#define LC_TEMPLATE 0x000007
#define LC_TEMPLATE_NAME 0x000001
#define LC_TEMPLATE_PARAM_KEY 0x000002
#define LC_TEMPLATE_PARAM_VALUE 0x000004

#define LC_ARGUMENT 0x000018
#define LC_ARGUMENT_NAME 0x000008
#define LC_ARGUMENT_DEFAULT 0x000010

#define LC_WIKILINK 0x000060
#define LC_WIKILINK_TITLE 0x000020
#define LC_WIKILINK_TEXT 0x000040

#define LC_HEADING 0x001F80
#define LC_HEADING_LEVEL_1 0x000080
#define LC_HEADING_LEVEL_2 0x000100
#define LC_HEADING_LEVEL_3 0x000200
#define LC_HEADING_LEVEL_4 0x000400
#define LC_HEADING_LEVEL_5 0x000800
#define LC_HEADING_LEVEL_6 0x001000

#define LC_COMMENT 0x002000

#define LC_TAG 0x03C000
#define LC_TAG_OPEN 0x004000
#define LC_TAG_ATTR 0x008000
#define LC_TAG_BODY 0x010000
#define LC_TAG_CLOSE 0x020000

#define LC_SAFETY_CHECK 0xFC0000
#define LC_HAS_TEXT 0x040000
#define LC_FAIL_ON_TEXT 0x080000
#define LC_FAIL_NEXT 0x100000
#define LC_FAIL_ON_LBRACE 0x200000
#define LC_FAIL_ON_RBRACE 0x400000
#define LC_FAIL_ON_EQUALS 0x800000

/* Global contexts: */

@@ -179,9 +185,15 @@ typedef struct {
#define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES)


/* Macros for accessing HTML tag definitions: */

#define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag))
#define IS_SINGLE(tag) (call_tag_def_func("is_single", tag))
#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag))


/* Function prototypes: */

static int heading_level_from_context(int);
static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
static struct Textbuffer* Textbuffer_new(void);
static void Tokenizer_dealloc(Tokenizer*);
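
Taken together, the widened LC_TAG_* context bits and the tag_defs macros are presumably what the upcoming tag-tokenization code will consult when deciding whether to descend into a tag's body. A minimal sketch of that kind of check, using only names introduced in this diff (the helper itself is hypothetical and not part of the commit):

    /* Hypothetical helper (not in this commit): decide whether a tag body
       should be parsed as wikicode.  'context' stands in for the tokenizer's
       local context word; 'tag' is a Python unicode object such as u"ref". */
    static int
    should_parse_tag_body(int context, PyObject* tag)
    {
        if (context & LC_TAG_CLOSE)   /* inside a closing tag: nothing to parse */
            return 0;
        if (IS_SINGLE_ONLY(tag))      /* e.g. <br>: the tag never takes a body */
            return 0;
        return IS_PARSABLE(tag);      /* otherwise defer to the Python tag_defs */
    }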

