A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
/*
Tokenizer for MWParserFromHell
Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef PY_SSIZE_T_CLEAN
#define PY_SSIZE_T_CLEAN
#endif

#include <Python.h>
#include <setjmp.h>
#include "structmember.h"

static PyObject* EMPTY;

#define PU (Py_UNICODE*)
static const Py_UNICODE* MARKERS[] = {PU"{", PU"}", PU"[", PU"]", PU"<", PU">",
                                      PU"|", PU"=", PU"&", PU"#", PU"*", PU";",
                                      PU":", PU"/", PU"-", PU"!", PU"\n", PU""};
#undef PU

static jmp_buf exception_env;
static const int BAD_ROUTE = 1;

static PyObject* contexts;
static PyObject* tokens;

static PyMethodDef
module_methods[] = {
    {NULL}
};
typedef struct {
    PyObject_HEAD
    PyObject* text;     /* text to tokenize */
    PyObject* stacks;   /* token stacks */
    PyObject* topstack; /* topmost stack */
    Py_ssize_t head;    /* current position in text */
    Py_ssize_t length;  /* length of text */
    Py_ssize_t global;  /* global context */
} Tokenizer;
static PyObject*
Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
{
    Tokenizer* self;

    self = (Tokenizer*) type->tp_alloc(type, 0);
    if (self != NULL) {
        self->text = Py_None;
        Py_INCREF(Py_None);

        self->stacks = PyList_New(0);
        if (!self->stacks) {
            Py_DECREF(self);
            return NULL;
        }

        self->head = 0;
        self->length = 0;
        self->global = 0;
    }

    return (PyObject*) self;
}
static void
Tokenizer_dealloc(Tokenizer* self)
{
    Py_XDECREF(self->text);
    Py_XDECREF(self->stacks);
    Py_XDECREF(self->topstack);
    self->ob_type->tp_free((PyObject*) self);
}
static int
Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
{
    static char* kwlist[] = {NULL};
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
        return -1;
    return 0;
}
#define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0)
#define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1)
#define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2)
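/*
    Each of these macros indexes self->topstack, the 3-item list built by
    Tokenizer_push() below: [0] the current token stack, [1] the context
    integer, [2] the textbuffer (a list of string fragments).
*/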
static int
Tokenizer_set_context(Tokenizer* self, Py_ssize_t value)
{
    if (PyList_SetItem(self->topstack, 1, PyInt_FromSsize_t(value)))
        return -1;
    return 0;
}

static int
Tokenizer_set_textbuffer(Tokenizer* self, PyObject* value)
{
    if (PyList_SetItem(self->topstack, 2, value))
        return -1;
    return 0;
}
/*
    Add a new token stack, context, and textbuffer to the list.
*/
static int
Tokenizer_push(Tokenizer* self, int context)
{
    PyObject* top = PyList_New(3);
    PyList_SET_ITEM(top, 0, PyList_New(0));
    PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(context));  /* store the given context */
    PyList_SET_ITEM(top, 2, PyList_New(0));

    Py_XDECREF(self->topstack);
    self->topstack = top;

    if (PyList_Append(self->stacks, top))
        return -1;
    return 0;
}
/*
    Push the textbuffer onto the stack as a Text node and clear it.
*/
static int
Tokenizer_push_textbuffer(Tokenizer* self)
{
    if (PySequence_Fast_GET_SIZE(Tokenizer_TEXTBUFFER(self)) > 0) {
        PyObject* text = PyUnicode_Join(EMPTY, Tokenizer_TEXTBUFFER(self));
        if (!text) return -1;

        PyObject* klass = PyObject_GetAttrString(tokens, "Text");
        if (!klass) return -1;

        PyObject* args = PyTuple_New(0);
        if (!args) return -1;

        PyObject* kwargs = PyDict_New();
        if (!kwargs) return -1;
        PyDict_SetItemString(kwargs, "text", text);
        Py_DECREF(text);

        PyObject* token = PyInstance_New(klass, args, kwargs);
        if (!token) {
            Py_DECREF(klass);
            Py_DECREF(args);
            Py_DECREF(kwargs);
            return -1;
        }
        Py_DECREF(klass);
        Py_DECREF(args);
        Py_DECREF(kwargs);

        if (PyList_Append(Tokenizer_STACK(self), token)) {
            Py_XDECREF(token);
            return -1;
        }
        Py_DECREF(token);

        if (Tokenizer_set_textbuffer(self, PyList_New(0)))
            return -1;
    }
    return 0;
}
static int
Tokenizer_delete_top_of_stack(Tokenizer* self)
{
    if (PySequence_DelItem(self->stacks, -1))
        return -1;
    Py_DECREF(self->topstack);

    Py_ssize_t size = PySequence_Fast_GET_SIZE(self->stacks);
    if (size > 0) {
        PyObject* top = PySequence_Fast_GET_ITEM(self->stacks, size - 1);
        self->topstack = top;
        Py_INCREF(top);
    }
    else {
        self->topstack = NULL;
    }
    return 0;
}
/*
    Pop the current stack/context/textbuffer, returning the stack.
*/
static PyObject*
Tokenizer_pop(Tokenizer* self)
{
    if (Tokenizer_push_textbuffer(self))
        return NULL;

    PyObject* stack = Tokenizer_STACK(self);
    Py_INCREF(stack);

    if (Tokenizer_delete_top_of_stack(self))
        return NULL;
    return stack;
}
/*
    Pop the current stack/context/textbuffer, returning the stack. We will also
    replace the underlying stack's context with the current stack's.
*/
static PyObject*
Tokenizer_pop_keeping_context(Tokenizer* self)
{
    if (Tokenizer_push_textbuffer(self))
        return NULL;

    PyObject* stack = Tokenizer_STACK(self);
    PyObject* context = Tokenizer_CONTEXT(self);
    Py_INCREF(stack);
    Py_INCREF(context);

    if (Tokenizer_delete_top_of_stack(self))
        return NULL;

    if (PyList_SetItem(self->topstack, 1, context))
        return NULL;
    return stack;
}
/*
    Fail the current tokenization route. Discards the current
    stack/context/textbuffer and "raises a BAD_ROUTE exception", which is
    implemented using longjmp().
*/
static void
Tokenizer_fail_route(Tokenizer* self)
{
    Tokenizer_pop(self);
    longjmp(exception_env, BAD_ROUTE);
}
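/*
    Nothing in this file calls setjmp() yet, so the following is only a sketch
    of how a caller could guard a speculative parsing attempt against the
    longjmp() above; parse_something() is a hypothetical helper, not part of
    this file:

        if (setjmp(exception_env) == BAD_ROUTE) {
            // the failed route has already popped its own stack inside
            // Tokenizer_fail_route(); recover or try another route here
        }
        else {
            parse_something(self);
        }
*/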
/*
    Write a token to the end of the current token stack.
*/
static int
Tokenizer_write(Tokenizer* self, PyObject* token)
{
    if (Tokenizer_push_textbuffer(self))
        return -1;
    if (PyList_Append(Tokenizer_STACK(self), token))
        return -1;
    return 0;
}

/*
    Write a token to the beginning of the current token stack.
*/
static int
Tokenizer_write_first(Tokenizer* self, PyObject* token)
{
    if (Tokenizer_push_textbuffer(self))
        return -1;
    if (PyList_Insert(Tokenizer_STACK(self), 0, token))
        return -1;
    return 0;
}
/*
    Write text to the current textbuffer.
*/
static int
Tokenizer_write_text(Tokenizer* self, PyObject* text)
{
    if (PyList_Append(Tokenizer_TEXTBUFFER(self), text))
        return -1;
    return 0;
}

/*
    Write a series of tokens to the current stack at once.
*/
static int
Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist)
{
    if (Tokenizer_push_textbuffer(self))
        return -1;

    PyObject* stack = Tokenizer_STACK(self);
    Py_ssize_t size = PySequence_Fast_GET_SIZE(stack);
    if (PyList_SetSlice(stack, size, size, tokenlist))
        return -1;
    return 0;
}
/*
    Pop the current stack, write text, and then write the stack.
*/
static int
Tokenizer_write_text_then_stack(Tokenizer* self, PyObject* text)
{
    PyObject* stack = Tokenizer_pop(self);
    if (Tokenizer_write_text(self, text)) {
        Py_XDECREF(stack);
        return -1;
    }

    if (stack) {
        if (PySequence_Fast_GET_SIZE(stack) > 0) {
            if (Tokenizer_write_all(self, stack)) {
                Py_DECREF(stack);
                return -1;
            }
        }
        Py_DECREF(stack);
    }

    self->head--;
    return 0;
}
/*
    Read the value at a relative point in the wikicode.
*/
static PyObject*
Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
{
    Py_ssize_t index = self->head + delta;
    if (index >= self->length) {
        return EMPTY;
    }
    return PySequence_Fast_GET_ITEM(self->text, index);
}
/*
    Parse the wikicode string, using *context* for when to stop.
*/
static PyObject*
Tokenizer_parse(Tokenizer* self, int context)
{
    PyObject* this;

    Tokenizer_push(self, context);
    while (1) {
        this = Tokenizer_read(self, 0);
        /* TODO: if *this* is not one of MARKERS, write it to the textbuffer
           via Tokenizer_write_text() instead of just advancing past it. */
        if (this == EMPTY) {
            return Tokenizer_pop(self);
        }
        self->head++;
    }
}
/*
    Build a list of tokens from a string of wikicode and return it.
*/
static PyObject*
Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{
    PyObject* text;

    if (!PyArg_ParseTuple(args, "U", &text)) {
        /* Failed to parse a Unicode object; try a string instead. */
        PyErr_Clear();
        const char* encoded;
        Py_ssize_t size;
        if (!PyArg_ParseTuple(args, "s#", &encoded, &size)) {
            return NULL;
        }

        PyObject* temp;
        temp = PyUnicode_FromStringAndSize(encoded, size);
        if (!temp)  /* check the object we just created, not the unset *text* */
            return NULL;

        Py_XDECREF(self->text);
        text = PySequence_Fast(temp, "expected a sequence");
        Py_XDECREF(temp);
        self->text = text;
    }
    else {
        Py_XDECREF(self->text);
        self->text = PySequence_Fast(text, "expected a sequence");
    }

    self->length = PySequence_Length(self->text);
    return Tokenizer_parse(self, 0);
}
static PyMethodDef
Tokenizer_methods[] = {
    {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
     "Build a list of tokens from a string of wikicode and return it."},
    {NULL}
};

static PyMemberDef
Tokenizer_members[] = {
    {NULL}
};
static PyTypeObject
TokenizerType = {
    PyObject_HEAD_INIT(NULL)
    0,                                      /* ob_size */
    "_tokenizer.CTokenizer",                /* tp_name */
    sizeof(Tokenizer),                      /* tp_basicsize */
    0,                                      /* tp_itemsize */
    (destructor) Tokenizer_dealloc,         /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_compare */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    0,                                      /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                     /* tp_flags */
    "Creates a list of tokens from a string of wikicode.", /* tp_doc */
    0,                                      /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    0,                                      /* tp_iter */
    0,                                      /* tp_iternext */
    Tokenizer_methods,                      /* tp_methods */
    Tokenizer_members,                      /* tp_members */
    0,                                      /* tp_getset */
    0,                                      /* tp_base */
    0,                                      /* tp_dict */
    0,                                      /* tp_descr_get */
    0,                                      /* tp_descr_set */
    0,                                      /* tp_dictoffset */
    (initproc) Tokenizer_init,              /* tp_init */
    0,                                      /* tp_alloc */
    Tokenizer_new,                          /* tp_new */
};
PyMODINIT_FUNC
init_tokenizer(void)
{
    PyObject* module;

    /* tp_new is already set to Tokenizer_new in the type definition above */
    if (PyType_Ready(&TokenizerType) < 0)
        return;

    module = Py_InitModule("_tokenizer", module_methods);
    Py_INCREF(&TokenizerType);
    PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);

    EMPTY = PyUnicode_FromString("");

    PyObject* globals = PyEval_GetGlobals();
    PyObject* locals = PyEval_GetLocals();
    PyObject* fromlist = PyList_New(0);

    contexts = PyImport_ImportModuleLevel("contexts", globals, locals, fromlist, 1);
    tokens = PyImport_ImportModuleLevel("tokens", globals, locals, fromlist, 1);
    Py_DECREF(fromlist);
}
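/*
    Sketch of how the module is meant to be used from Python 2 once built,
    based only on the names registered above ("_tokenizer", "CTokenizer",
    "tokenize"); note that Tokenizer_parse() is still a work in progress, so
    the returned list is not yet populated with Text tokens:

        from _tokenizer import CTokenizer

        tokenizer = CTokenizer()
        tokens = tokenizer.tokenize(u"Some [[wikicode]] to tokenize")
*/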