@@ -0,0 +1,322 @@
/*
Tokenizer for MWParserFromHell
Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

|
|
|
|
#ifndef PY_SSIZE_T_CLEAN |
|
|
|
#define PY_SSIZE_T_CLEAN |
|
|
|
#endif |
|
|
|
|
|
|
|
#include <Python.h> |
|
|
|
#include "structmember.h" |
|
|
|
|
|
|
|
/* Sentinel returned by Tokenizer_read() when the requested position is past
   the end of the text. It is compared by pointer identity and points at an
   empty string, so it is also safe to dereference. */
static const Py_UNICODE EMPTY_STRING[1] = {0};
static const Py_UNICODE* OUT_OF_BOUNDS = EMPTY_STRING;

/* Characters that may open or delimit wikicode syntax; everything else is
   plain text. Entries are single-character strings compared by their first
   character. End-of-text is detected via OUT_OF_BOUNDS rather than a list
   entry here, since the sentinel has a different type. */
static const char* MARKERS[] = {"{", "}", "[", "]", "<", ">", "|", "=",
                                "&", "#", "*", ";", ":", "/", "-", "!",
                                "\n"};
#define NUM_MARKERS ((int) (sizeof(MARKERS) / sizeof(MARKERS[0])))

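/* For orientation (background on wikicode, not taken from this file): runs
   of these characters introduce constructs, e.g. "{{" begins a template and
   "[[" begins a wikilink; the tokenizer treats anything else as text. */
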
static PyMethodDef
module_methods[] = {
    {NULL}
};

typedef struct {
    PyObject_HEAD
    PyObject* text;       /* text to tokenize */
    PyObject* stacks;     /* token stacks */
    PyObject* topstack;   /* topmost stack */
    Py_ssize_t head;      /* current position in text */
    Py_ssize_t length;    /* length of text */
    Py_ssize_t global;    /* global context */
} Tokenizer;

static PyObject*
Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
{
    Tokenizer* self;

    self = (Tokenizer*) type->tp_alloc(type, 0);
    if (self != NULL) {
        self->text = Py_None;
        Py_INCREF(Py_None);

        self->stacks = PyList_New(0);
        if (self->stacks == NULL) {
            Py_DECREF(self);
            return NULL;
        }

        self->topstack = NULL;
        self->head = 0;
        self->length = 0;
        self->global = 0;
    }

    return (PyObject*) self;
}

static void
Tokenizer_dealloc(Tokenizer* self)
{
    Py_XDECREF(self->text);
    Py_XDECREF(self->stacks);
    Py_XDECREF(self->topstack);
    self->ob_type->tp_free((PyObject*) self);
}

static int
Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
{
    static char* kwlist[] = {NULL};
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
        return -1;
    return 0;
}

#define Tokenizer_STACK(self) PyList_GET_ITEM(self->topstack, 0)
#define Tokenizer_CONTEXT(self) PyList_GET_ITEM(self->topstack, 1)
#define Tokenizer_TEXTBUFFER(self) PyList_GET_ITEM(self->topstack, 2)

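/* Each entry of self->stacks is the 3-item list [token stack, context,
   textbuffer] built by Tokenizer_push(); self->topstack aliases the last
   entry, so the macros above read the current stack, context, and
   textbuffer as borrowed references. */
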
static int
Tokenizer_set_context(Tokenizer* self, Py_ssize_t value)
{
    PyObject* val = PyInt_FromSsize_t(value);
    if (val == NULL)
        return -1;
    if (PyList_SetItem(self->topstack, 1, val))  /* steals val */
        return -1;
    return 0;
}

static int
Tokenizer_set_textbuffer(Tokenizer* self, PyObject* value)
{
    /* value may be NULL if the caller's allocation failed */
    if (value == NULL || PyList_SetItem(self->topstack, 2, value))
        return -1;
    return 0;
}

/*
    Add a new token stack, context, and textbuffer to the list.
*/
static int
Tokenizer_push(Tokenizer* self, int context)
{
    PyObject* top = PyList_New(3);
    if (top == NULL)
        return -1;
    PyList_SET_ITEM(top, 0, PyList_New(0));
    PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(context));
    PyList_SET_ITEM(top, 2, PyList_New(0));

    Py_XDECREF(self->topstack);
    self->topstack = top;

    if (PyList_Append(self->stacks, top))
        return -1;
    return 0;
}

/*
    Push the textbuffer onto the stack as a Text node and clear it.
*/
static int
Tokenizer_push_textbuffer(Tokenizer* self)
{
    if (PyList_GET_SIZE(Tokenizer_TEXTBUFFER(self)) > 0) {
        PyObject* sep;
        PyObject* text;

        /* text = "".join(self._textbuffer) */
        sep = PyUnicode_FromString("");
        if (sep == NULL)
            return -1;
        text = PyUnicode_Join(sep, Tokenizer_TEXTBUFFER(self));
        Py_DECREF(sep);
        if (text == NULL)
            return -1;

        /* TODO: this should be wrapped as tokens.Text(text=...); the raw
           string is appended for now (see the sketch below) */
        if (PyList_Append(Tokenizer_STACK(self), text)) {
            Py_DECREF(text);
            return -1;
        }
        Py_DECREF(text);

        if (Tokenizer_set_textbuffer(self, PyList_New(0)))
            return -1;
    }
    return 0;
}

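/* A possible completion for the TODO above, assuming the token classes live
   in mwparserfromhell.parser.tokens (a guess based on the Python
   implementation's "tokens.Text" comment; not confirmed by this file).
   Builds tokens.Text(text=<text>) via the C API; does not steal the
   reference to text. Not yet wired into Tokenizer_push_textbuffer(). */
static PyObject*
Tokenizer_build_text_token(PyObject* text)
{
    PyObject *module, *cls, *args, *kwargs, *token;

    module = PyImport_ImportModule("mwparserfromhell.parser.tokens");
    if (module == NULL)
        return NULL;
    cls = PyObject_GetAttrString(module, "Text");
    Py_DECREF(module);
    if (cls == NULL)
        return NULL;
    args = PyTuple_New(0);
    kwargs = Py_BuildValue("{s:O}", "text", text);
    if (args == NULL || kwargs == NULL) {
        Py_XDECREF(args);
        Py_XDECREF(kwargs);
        Py_DECREF(cls);
        return NULL;
    }
    token = PyObject_Call(cls, args, kwargs);  /* tokens.Text(text=text) */
    Py_DECREF(cls);
    Py_DECREF(args);
    Py_DECREF(kwargs);
    return token;
}
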
/*
    Pop the current stack/context/textbuffer, returning the stack.
*/
static PyObject*
Tokenizer_pop(Tokenizer* self)
{
    PyObject* stack;
    Py_ssize_t size;

    if (Tokenizer_push_textbuffer(self))
        return NULL;

    stack = Tokenizer_STACK(self);      /* the stack we will return */
    Py_INCREF(stack);
    size = PyList_GET_SIZE(self->stacks);
    if (PyList_SetSlice(self->stacks, size - 1, size, NULL)) {  /* .pop() */
        Py_DECREF(stack);
        return NULL;
    }

    /* repoint topstack at the new last entry, if any */
    Py_XDECREF(self->topstack);
    size = PyList_GET_SIZE(self->stacks);
    self->topstack = size ? PyList_GET_ITEM(self->stacks, size - 1) : NULL;
    Py_XINCREF(self->topstack);
    return stack;
}

/*
    Pop the current stack/context/textbuffer, returning the stack. We will
    also replace the underlying stack's context with the current stack's.
*/
static PyObject*
Tokenizer_pop_keeping_context(Tokenizer* self)
{
    PyObject* stack;
    PyObject* context;

    context = Tokenizer_CONTEXT(self);  /* remember the current context */
    Py_INCREF(context);
    stack = Tokenizer_pop(self);        /* also flushes the textbuffer */
    if (stack == NULL) {
        Py_DECREF(context);
        return NULL;
    }
    /* copy the saved context onto the stack underneath */
    if (self->topstack == NULL)
        Py_DECREF(context);
    else if (PyList_SetItem(self->topstack, 1, context)) {  /* steals it */
        Py_DECREF(stack);
        return NULL;
    }
    return stack;
}

/*
    Read the value at a relative point in the wikicode.
*/
static const Py_UNICODE*
Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
{
    Py_ssize_t index = self->head + delta;
    PyObject* item;

    if (index >= self->length)
        return OUT_OF_BOUNDS;

    item = PySequence_Fast_GET_ITEM(self->text, index);
    return PyUnicode_AS_UNICODE(item);
}

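/* Example: tokenizing u"{{foo}}" with self->head == 2, Tokenizer_read(self,
   0) points at 'f' and Tokenizer_read(self, 1) at 'o', while
   Tokenizer_read(self, 5) and beyond yield OUT_OF_BOUNDS. */
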
/*
    Parse the wikicode string, using *context* for when to stop.
*/
static PyObject*
Tokenizer_parse(Tokenizer* self, int context)
{
    const Py_UNICODE* this;
    int i, is_marker;

    if (Tokenizer_push(self, context))
        return NULL;

    while (1) {
        this = Tokenizer_read(self, 0);
        if (this == OUT_OF_BOUNDS)
            return Tokenizer_pop(self);

        /* this not in MARKERS  =>  plain text */
        is_marker = 0;
        for (i = 0; i < NUM_MARKERS; i++) {
            if (*this == (Py_UNICODE) *MARKERS[i]) {
                is_marker = 1;
                break;
            }
        }

        if (!is_marker) {
            /* WRITE TEXT: buffer the character and advance */
            PyObject* chunk = PyUnicode_FromUnicode(this, 1);
            if (chunk == NULL)
                return NULL;
            if (PyList_Append(Tokenizer_TEXTBUFFER(self), chunk)) {
                Py_DECREF(chunk);
                return NULL;
            }
            Py_DECREF(chunk);
            self->head++;
            continue;
        }

        /* Marker handling is not implemented yet; for now, dump the
           character for debugging and skip over it. */
        printf("%p %i %c\n", (void*) this, (int) *this, (char) *this);
        self->head++;
    }
}

/*
    Build a list of tokens from a string of wikicode and return it.
*/
static PyObject*
Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{
    PyObject* text;

    if (!PyArg_ParseTuple(args, "U", &text)) {
        const char* encoded;
        Py_ssize_t size;
        PyObject* temp;

        /* Failed to parse a Unicode object; try a string instead. */
        PyErr_Clear();
        if (!PyArg_ParseTuple(args, "s#", &encoded, &size))
            return NULL;

        temp = PyUnicode_FromStringAndSize(encoded, size);
        if (temp == NULL)
            return NULL;

        Py_XDECREF(self->text);
        text = PySequence_Fast(temp, "expected a sequence");
        Py_DECREF(temp);
        self->text = text;
    }
    else {
        Py_XDECREF(self->text);
        self->text = PySequence_Fast(text, "expected a sequence");
    }

    if (self->text == NULL)
        return NULL;
    self->length = PySequence_Length(self->text);

    return Tokenizer_parse(self, 0);
}

static PyMethodDef
Tokenizer_methods[] = {
    {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
     "Build a list of tokens from a string of wikicode and return it."},
    {NULL}
};

static PyMemberDef
Tokenizer_members[] = {
    {NULL}
};

static PyTypeObject
TokenizerType = {
    PyObject_HEAD_INIT(NULL)
    0,                                  /* ob_size */
    "_tokenizer.CTokenizer",            /* tp_name */
    sizeof(Tokenizer),                  /* tp_basicsize */
    0,                                  /* tp_itemsize */
    (destructor) Tokenizer_dealloc,     /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    0,                                  /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    "Creates a list of tokens from a string of wikicode.",  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    Tokenizer_methods,                  /* tp_methods */
    Tokenizer_members,                  /* tp_members */
    0,                                  /* tp_getset */
    0,                                  /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    (initproc) Tokenizer_init,          /* tp_init */
    0,                                  /* tp_alloc */
    Tokenizer_new,                      /* tp_new */
};

PyMODINIT_FUNC
init_tokenizer(void)
{
    PyObject* module;

    /* tp_new is already set to Tokenizer_new in the type definition above;
       overriding it with PyType_GenericNew here would skip our allocator. */
    if (PyType_Ready(&TokenizerType) < 0)
        return;

    module = Py_InitModule("_tokenizer", module_methods);
    if (module == NULL)
        return;

    Py_INCREF(&TokenizerType);
    PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
}
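
/* Build and usage sketch. Only the module name ("_tokenizer") and class
   name ("CTokenizer") are taken from this file; the setup.py below is a
   hypothetical minimal build script, and the exact output shape depends on
   the unfinished marker handling above.

   A minimal setup.py for Python 2:

       from distutils.core import setup, Extension
       setup(ext_modules=[Extension("_tokenizer", ["tokenizer.c"])])

   Then, from Python:

       from _tokenizer import CTokenizer
       tokens = CTokenizer().tokenize(u"Hello {{world}}!")
*/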