
Merge branch 'feature/c_improvements' into develop (closes #18)

Tag: v0.3
Author: Ben Kurtovic, 11 years ago
Commit: 2305cc7d83

3 changed files with 256 additions and 427 deletions:
  1. mwparserfromhell/parser/tokenizer.c  (+168, -365)
  2. mwparserfromhell/parser/tokenizer.h  (+87, -58)
  3. setup.py  (+1, -4)

mwparserfromhell/parser/tokenizer.c  (+168, -365)

(Diff not shown: file diff suppressed because it is too large.)

mwparserfromhell/parser/tokenizer.h  (+87, -58)

@@ -28,6 +28,7 @@ SOFTWARE.
 #include <Python.h>
 #include <math.h>
 #include <structmember.h>
+#include <bytesobject.h>
 
 #if PY_MAJOR_VERSION >= 3
 #define IS_PY3K
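The new <bytesobject.h> include is part of the same 2/3 unification as the macros added below: on Python 2 that header aliases the PyBytes_* names to the corresponding PyString_* APIs, so the rest of the file can use the Python 3 spelling on both interpreters.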
@@ -103,55 +104,59 @@ static PyObject* TagCloseClose;
 
 /* Local contexts: */
 
-#define LC_TEMPLATE             0x00000007
-#define LC_TEMPLATE_NAME        0x00000001
-#define LC_TEMPLATE_PARAM_KEY   0x00000002
-#define LC_TEMPLATE_PARAM_VALUE 0x00000004
-
-#define LC_ARGUMENT         0x00000018
-#define LC_ARGUMENT_NAME    0x00000008
-#define LC_ARGUMENT_DEFAULT 0x00000010
-
-#define LC_WIKILINK       0x00000060
-#define LC_WIKILINK_TITLE 0x00000020
-#define LC_WIKILINK_TEXT  0x00000040
-
-#define LC_HEADING         0x00001F80
-#define LC_HEADING_LEVEL_1 0x00000080
-#define LC_HEADING_LEVEL_2 0x00000100
-#define LC_HEADING_LEVEL_3 0x00000200
-#define LC_HEADING_LEVEL_4 0x00000400
-#define LC_HEADING_LEVEL_5 0x00000800
-#define LC_HEADING_LEVEL_6 0x00001000
-
-#define LC_COMMENT 0x00002000
-
-#define LC_TAG       0x0003C000
-#define LC_TAG_OPEN  0x00004000
-#define LC_TAG_ATTR  0x00008000
-#define LC_TAG_BODY  0x00010000
-#define LC_TAG_CLOSE 0x00020000
-
-#define LC_STYLE             0x003C0000
-#define LC_STYLE_ITALICS     0x00040000
-#define LC_STYLE_BOLD        0x00080000
-#define LC_STYLE_PASS_AGAIN  0x00100000
-#define LC_STYLE_SECOND_PASS 0x00200000
-
-#define LC_DLTERM 0x00400000
-
-#define LC_SAFETY_CHECK   0x1F800000
-#define LC_HAS_TEXT       0x00800000
-#define LC_FAIL_ON_TEXT   0x01000000
-#define LC_FAIL_NEXT      0x02000000
-#define LC_FAIL_ON_LBRACE 0x04000000
-#define LC_FAIL_ON_RBRACE 0x08000000
-#define LC_FAIL_ON_EQUALS 0x10000000
+#define LC_TEMPLATE             0x0000007
+#define LC_TEMPLATE_NAME        0x0000001
+#define LC_TEMPLATE_PARAM_KEY   0x0000002
+#define LC_TEMPLATE_PARAM_VALUE 0x0000004
+
+#define LC_ARGUMENT         0x0000018
+#define LC_ARGUMENT_NAME    0x0000008
+#define LC_ARGUMENT_DEFAULT 0x0000010
+
+#define LC_WIKILINK       0x0000060
+#define LC_WIKILINK_TITLE 0x0000020
+#define LC_WIKILINK_TEXT  0x0000040
+
+#define LC_HEADING         0x0001F80
+#define LC_HEADING_LEVEL_1 0x0000080
+#define LC_HEADING_LEVEL_2 0x0000100
+#define LC_HEADING_LEVEL_3 0x0000200
+#define LC_HEADING_LEVEL_4 0x0000400
+#define LC_HEADING_LEVEL_5 0x0000800
+#define LC_HEADING_LEVEL_6 0x0001000
+
+#define LC_TAG       0x001E000
+#define LC_TAG_OPEN  0x0002000
+#define LC_TAG_ATTR  0x0004000
+#define LC_TAG_BODY  0x0008000
+#define LC_TAG_CLOSE 0x0010000
+
+#define LC_STYLE             0x01E0000
+#define LC_STYLE_ITALICS     0x0020000
+#define LC_STYLE_BOLD        0x0040000
+#define LC_STYLE_PASS_AGAIN  0x0080000
+#define LC_STYLE_SECOND_PASS 0x0100000
+
+#define LC_DLTERM 0x0200000
+
+#define LC_SAFETY_CHECK   0xFC00000
+#define LC_HAS_TEXT       0x0400000
+#define LC_FAIL_ON_TEXT   0x0800000
+#define LC_FAIL_NEXT      0x1000000
+#define LC_FAIL_ON_LBRACE 0x2000000
+#define LC_FAIL_ON_RBRACE 0x4000000
+#define LC_FAIL_ON_EQUALS 0x8000000
 
 /* Global contexts: */
 
 #define GL_HEADING 0x1
 
+/* Aggregate contexts: */
+
+#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE)
+#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
+#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
+
 /* Tag contexts: */
 
 #define TAG_NAME 0x01
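Each LC_ value reserves one bit, and group masks like LC_TEMPLATE simply OR their members together, so the tokenizer can carry its whole parse state in a single integer and test an entire family of states with one AND. The renumbering above reclaims the bit freed by dropping the old LC_COMMENT context (everything from LC_TAG up shifts down one bit), and the new AGG_ masks name the flag combinations checked most often. A minimal sketch of the pattern, using an abridged flag set (illustrative only, not code from tokenizer.c):

/* Sketch of the bit-flag pattern behind the LC_ and AGG_ contexts above;
   abridged flag set, not code from tokenizer.c. */
#include <stdio.h>

#define LC_TEMPLATE           0x0000007   /* mask covering all template bits */
#define LC_TEMPLATE_NAME      0x0000001
#define LC_TEMPLATE_PARAM_KEY 0x0000002
#define LC_WIKILINK_TITLE     0x0000020
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE)  /* abridged */

int main(void)
{
    int context = 0;

    context |= LC_TEMPLATE_NAME;       /* enter: now parsing a template name */
    if (context & LC_TEMPLATE)         /* true for any template sub-context */
        printf("inside a template\n");
    if (context & AGG_UNSAFE)          /* one test covers several contexts */
        printf("context needs safety checks\n");
    context &= ~LC_TEMPLATE_NAME;      /* leave the template name context */
    return 0;
}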
@@ -210,13 +215,18 @@ typedef struct {
 } Tokenizer;
 
 
-/* Macros for accessing Tokenizer data: */
+/* Macros related to Tokenizer functions: */
 
 #define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta)))
 #define Tokenizer_READ_BACKWARDS(self, delta) \
     (*PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, delta)))
 #define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES)
 
+#define Tokenizer_emit(self, token) Tokenizer_emit_token(self, token, 0)
+#define Tokenizer_emit_first(self, token) Tokenizer_emit_token(self, token, 1)
+#define Tokenizer_emit_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 0)
+#define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1)
+
 
 /* Macros for accessing HTML tag definitions: */
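The added emit macros are thin wrappers that bake the insert-at-head flag into the name: Tokenizer_emit() appends a token while Tokenizer_emit_first() pushes one onto the front of the stack, and the _kwargs variants do the same for tokens carrying attributes. Call sites can state their intent directly instead of passing a bare 0 or 1 to Tokenizer_emit_token(), matching the naming of the emit helpers in the pure-Python tokenizer.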


@@ -242,29 +252,39 @@ static PyObject* Tokenizer_parse(Tokenizer*, int, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
 
 
+/* Macros for Python 2/3 compatibility: */
+
+#ifdef IS_PY3K
+#define NEW_INT_FUNC      PyLong_FromSsize_t
+#define IMPORT_NAME_FUNC  PyUnicode_FromString
+#define CREATE_MODULE     PyModule_Create(&module_def);
+#define ENTITYDEFS_MODULE "html.entities"
+#define INIT_FUNC_NAME    PyInit__tokenizer
+#define INIT_ERROR        return NULL
+#else
+#define NEW_INT_FUNC      PyInt_FromSsize_t
+#define IMPORT_NAME_FUNC  PyBytes_FromString
+#define CREATE_MODULE     Py_InitModule("_tokenizer", NULL);
+#define ENTITYDEFS_MODULE "htmlentitydefs"
+#define INIT_FUNC_NAME    init_tokenizer
+#define INIT_ERROR        return
+#endif
+
+
 /* More structs for creating the Tokenizer type: */
 
-static PyMethodDef
-Tokenizer_methods[] = {
+static PyMethodDef Tokenizer_methods[] = {
     {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
     "Build a list of tokens from a string of wikicode and return it."},
     {NULL}
 };
 
-static PyMemberDef
-Tokenizer_members[] = {
+static PyMemberDef Tokenizer_members[] = {
     {NULL}
 };
 
-static PyMethodDef
-module_methods[] = {
-    {NULL}
-};
-
-static PyTypeObject
-TokenizerType = {
-    PyObject_HEAD_INIT(NULL)
-    0,                          /* ob_size */
+static PyTypeObject TokenizerType = {
+    PyVarObject_HEAD_INIT(NULL, 0)
     "_tokenizer.CTokenizer",    /* tp_name */
     sizeof(Tokenizer),          /* tp_basicsize */
     0,                          /* tp_itemsize */
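Two portability fixes are visible in the type setup: PyObject_HEAD_INIT(NULL) followed by an explicit 0 for ob_size is the Python 2-only spelling, while PyVarObject_HEAD_INIT(NULL, 0) expands correctly under both major versions, which is why the two old lines collapse into one. The module_methods table, which held nothing but its {NULL} sentinel, is also dropped; the Python 2 branch of CREATE_MODULE now passes NULL straight to Py_InitModule().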
@@ -303,3 +323,12 @@ TokenizerType = {
     0,                          /* tp_alloc */
     Tokenizer_new,              /* tp_new */
 };
+
+#ifdef IS_PY3K
+static PyModuleDef module_def = {
+    PyModuleDef_HEAD_INIT,
+    "_tokenizer",
+    "Creates a list of tokens from a string of wikicode.",
+    -1, NULL, NULL, NULL, NULL, NULL
+};
+#endif
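Because the tokenizer.c diff is suppressed above, the init function that consumes these macros is not visible here, but the definitions imply the standard single-source Python 2/3 init pattern. A sketch of how they are evidently meant to combine, under that assumption (not the actual body of tokenizer.c):

/* Sketch only: how INIT_FUNC_NAME, CREATE_MODULE, and INIT_ERROR fit
   together; illustrative, not the code from tokenizer.c. */
PyMODINIT_FUNC INIT_FUNC_NAME(void)   /* PyInit__tokenizer on 3, init_tokenizer on 2 */
{
    PyObject* module;

    if (PyType_Ready(&TokenizerType) < 0)
        INIT_ERROR;                   /* "return NULL" on 3, "return" on 2 */
    module = CREATE_MODULE            /* the macro supplies its own ';' */
    if (!module)
        INIT_ERROR;
    Py_INCREF(&TokenizerType);
    PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
#ifdef IS_PY3K
    return module;                    /* Python 3 init functions return the module */
#endif
}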

setup.py  (+1, -4)

@@ -29,16 +29,13 @@ from mwparserfromhell.compat import py3k
 with open("README.rst") as fp:
     long_docs = fp.read()
 
-# builder = Extension("mwparserfromhell.parser._builder",
-#                     sources = ["mwparserfromhell/parser/builder.c"])
-
 tokenizer = Extension("mwparserfromhell.parser._tokenizer",
                       sources = ["mwparserfromhell/parser/tokenizer.c"])
 
 setup(
     name = "mwparserfromhell",
     packages = find_packages(exclude=("tests",)),
-    ext_modules = [] if py3k else [tokenizer],
+    ext_modules = [tokenizer],
     test_suite = "tests",
     version = __version__,
     author = "Ben Kurtovic",

