A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes (-) and can be up to 35 characters long.
 
 
 
 

742 lines
28 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from itertools import takewhile
  24. from math import log
  25. import re
  26. from . import contexts
  27. from . import tokens
  28. from ..compat import htmlentities
  29. from ..tag_defs import is_parsable
  30. __all__ = ["Tokenizer"]
class BadRoute(Exception):
    """Raised internally when the current tokenization route is invalid."""
    pass
class _TagOpenData(object):
    """Stores data about an HTML open tag, like ``<ref name="foo">``."""
    # Bit flags describing our position within the open tag
    # (meanings inferred from their use in _handle_tag_chunk):
    CX_NAME = 1 << 0         # reading the tag's name
    CX_ATTR_READY = 1 << 1   # ready to begin a new attribute
    CX_ATTR_NAME = 1 << 2    # reading an attribute's name
    CX_ATTR_VALUE = 1 << 3   # reading an attribute's value
    CX_NEED_SPACE = 1 << 4   # whitespace is required before continuing
    CX_NEED_EQUALS = 1 << 5  # an "=" is required before a value may start
    CX_NEED_QUOTE = 1 << 6   # a quote may open the attribute value here
    CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE

    def __init__(self):
        self.context = self.CX_NAME  # current position within the tag
        self.literal = True          # are we outside of a quoted value?
        self.padding_buffer = []     # collected whitespace padding
        self.quote_buffer = []       # text collected inside a quoted value
        self.reset = 0               # head position to rewind to on re-parse
        self.ignore_quote = False    # treat a quote char as literal text?
class Tokenizer(object):
    """Creates a list of tokens from a string of wikicode."""
    USES_C = False       # marks this as the pure-Python tokenizer
    START = object()     # sentinel: read before the start of the text
    END = object()       # sentinel: read past the end of the text
    # Characters that can begin or delimit wikicode syntax; _parse() gives
    # segments made of these special handling.
    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
               "/", "-", "!", "\n", END]
    MAX_DEPTH = 40       # maximum nesting depth before refusing to recurse
    MAX_CYCLES = 100000  # maximum total stack pushes before refusing
    regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)
    tag_splitter = re.compile(r"([\s\"\\])")

    def __init__(self):
        self._text = None    # list of text segments produced by tokenize()
        self._head = 0       # current read index into self._text
        self._stacks = []    # stack of [token stack, context, textbuffer]
        self._global = 0     # global (stack-independent) context flags
        self._depth = 0      # current parse nesting depth
        self._cycles = 0     # total number of stack pushes so far
    @property
    def _stack(self):
        """The current token stack."""
        return self._stacks[-1][0]

    @property
    def _context(self):
        """The current token context."""
        return self._stacks[-1][1]

    @_context.setter
    def _context(self, value):
        self._stacks[-1][1] = value

    @property
    def _textbuffer(self):
        """The current textbuffer."""
        return self._stacks[-1][2]

    @_textbuffer.setter
    def _textbuffer(self, value):
        self._stacks[-1][2] = value
    def _push(self, context=0):
        """Add a new token stack, context, and textbuffer to the list."""
        self._stacks.append([[], context, []])
        self._depth += 1
        self._cycles += 1

    def _push_textbuffer(self):
        """Push the textbuffer onto the stack as a Text node and clear it."""
        if self._textbuffer:
            self._stack.append(tokens.Text(text="".join(self._textbuffer)))
            self._textbuffer = []
    def _pop(self, keep_context=False):
        """Pop the current stack/context/textbuffer, returning the stack.

        If *keep_context* is ``True``, then we will replace the underlying
        stack's context with the current stack's.
        """
        self._push_textbuffer()
        self._depth -= 1
        if keep_context:
            context = self._context
            stack = self._stacks.pop()[0]
            self._context = context
            return stack
        return self._stacks.pop()[0]

    def _can_recurse(self):
        """Return whether or not our max recursion depth has been exceeded."""
        return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES
    def _fail_route(self):
        """Fail the current tokenization route.

        Discards the current stack/context/textbuffer and raises
        :py:exc:`~.BadRoute`.
        """
        self._pop()
        raise BadRoute()

    def _write(self, token):
        """Write a token to the end of the current token stack."""
        self._push_textbuffer()
        self._stack.append(token)
    def _write_first(self, token):
        """Write a token to the beginning of the current token stack."""
        self._push_textbuffer()
        self._stack.insert(0, token)

    def _write_text(self, text):
        """Write text to the current textbuffer."""
        self._textbuffer.append(text)

    def _write_all(self, tokenlist):
        """Write a series of tokens to the current stack at once."""
        if tokenlist and isinstance(tokenlist[0], tokens.Text):
            # Merge a leading Text token into the textbuffer so adjacent
            # text coalesces into a single Text token.
            self._write_text(tokenlist.pop(0).text)
        self._push_textbuffer()
        self._stack.extend(tokenlist)
    def _write_text_then_stack(self, text):
        """Pop the current stack, write *text*, and then write the stack."""
        stack = self._pop()
        self._write_text(text)
        if stack:
            self._write_all(stack)
        # NOTE(review): rewinds one step — presumably to offset the head
        # advance performed by the calling parse loop; confirm against callers.
        self._head -= 1
    def _read(self, delta=0, wrap=False, strict=False):
        """Read the value at a relative point in the wikicode.

        The value is read from :py:attr:`self._head <_head>` plus the value of
        *delta* (which can be negative). If *wrap* is ``False``, we will not
        allow attempts to read from the end of the string if ``self._head +
        delta`` is negative. If *strict* is ``True``, the route will be failed
        (with :py:meth:`_fail_route`) if we try to read from past the end of
        the string; otherwise, :py:attr:`self.END <END>` is returned. If we try
        to read from before the start of the string, :py:attr:`self.START
        <START>` is returned.
        """
        index = self._head + delta
        if index < 0 and (not wrap or abs(index) > len(self._text)):
            return self.START
        try:
            return self._text[index]
        except IndexError:
            if strict:
                self._fail_route()
            return self.END
    def _parse_template_or_argument(self):
        """Parse a template or argument at the head of the wikicode string."""
        self._head += 2
        braces = 2
        # Count every consecutive opening brace.
        while self._read() == "{":
            self._head += 1
            braces += 1
        self._push()
        while braces:
            if braces == 1:
                # A single leftover "{" cannot open anything; emit as text.
                return self._write_text_then_stack("{")
            if braces == 2:
                try:
                    self._parse_template()
                except BadRoute:
                    return self._write_text_then_stack("{{")
                break
            # Three or more braces: prefer an argument ({{{...}}}), falling
            # back to a template ({{...}}), then to literal text.
            try:
                self._parse_argument()
                braces -= 3
            except BadRoute:
                try:
                    self._parse_template()
                    braces -= 2
                except BadRoute:
                    return self._write_text_then_stack("{" * braces)
            if braces:
                self._head += 1
        self._write_all(self._pop())
  193. def _parse_template(self):
  194. """Parse a template at the head of the wikicode string."""
  195. reset = self._head
  196. try:
  197. template = self._parse(contexts.TEMPLATE_NAME)
  198. except BadRoute:
  199. self._head = reset
  200. raise
  201. self._write_first(tokens.TemplateOpen())
  202. self._write_all(template)
  203. self._write(tokens.TemplateClose())
  204. def _parse_argument(self):
  205. """Parse an argument at the head of the wikicode string."""
  206. reset = self._head
  207. try:
  208. argument = self._parse(contexts.ARGUMENT_NAME)
  209. except BadRoute:
  210. self._head = reset
  211. raise
  212. self._write_first(tokens.ArgumentOpen())
  213. self._write_all(argument)
  214. self._write(tokens.ArgumentClose())
    def _handle_template_param(self):
        """Handle a template parameter at the head of the string."""
        if self._context & contexts.TEMPLATE_NAME:
            self._context ^= contexts.TEMPLATE_NAME
        elif self._context & contexts.TEMPLATE_PARAM_VALUE:
            self._context ^= contexts.TEMPLATE_PARAM_VALUE
        elif self._context & contexts.TEMPLATE_PARAM_KEY:
            # Finish the previous parameter's key before starting a new one.
            self._write_all(self._pop(keep_context=True))
        self._context |= contexts.TEMPLATE_PARAM_KEY
        self._write(tokens.TemplateParamSeparator())
        self._push(self._context)

    def _handle_template_param_value(self):
        """Handle a template parameter's value at the head of the string."""
        self._write_all(self._pop(keep_context=True))
        self._context ^= contexts.TEMPLATE_PARAM_KEY
        self._context |= contexts.TEMPLATE_PARAM_VALUE
        self._write(tokens.TemplateParamEquals())
    def _handle_template_end(self):
        """Handle the end of a template at the head of the string."""
        if self._context & contexts.TEMPLATE_PARAM_KEY:
            # Flush the in-progress parameter key before closing.
            self._write_all(self._pop(keep_context=True))
        self._head += 1
        return self._pop()

    def _handle_argument_separator(self):
        """Handle the separator between an argument's name and default."""
        self._context ^= contexts.ARGUMENT_NAME
        self._context |= contexts.ARGUMENT_DEFAULT
        self._write(tokens.ArgumentSeparator())

    def _handle_argument_end(self):
        """Handle the end of an argument at the head of the string."""
        self._head += 2
        return self._pop()
    def _parse_wikilink(self):
        """Parse an internal wikilink at the head of the wikicode string."""
        self._head += 2
        reset = self._head - 1
        try:
            wikilink = self._parse(contexts.WIKILINK_TITLE)
        except BadRoute:
            # Not a valid wikilink; emit the brackets as literal text.
            self._head = reset
            self._write_text("[[")
        else:
            self._write(tokens.WikilinkOpen())
            self._write_all(wikilink)
            self._write(tokens.WikilinkClose())

    def _handle_wikilink_separator(self):
        """Handle the separator between a wikilink's title and its text."""
        self._context ^= contexts.WIKILINK_TITLE
        self._context |= contexts.WIKILINK_TEXT
        self._write(tokens.WikilinkSeparator())

    def _handle_wikilink_end(self):
        """Handle the end of a wikilink at the head of the string."""
        self._head += 1
        return self._pop()
    def _parse_heading(self):
        """Parse a section heading at the head of the wikicode string."""
        self._global |= contexts.GL_HEADING
        reset = self._head
        self._head += 1
        best = 1
        # Count the consecutive "="s to determine the candidate level.
        while self._read() == "=":
            best += 1
            self._head += 1
        context = contexts.HEADING_LEVEL_1 << min(best - 1, 5)
        try:
            title, level = self._parse(context)
        except BadRoute:
            self._head = reset + best - 1
            self._write_text("=" * best)
        else:
            self._write(tokens.HeadingStart(level=level))
            if level < best:
                # More "="s were opened than closed; the extras are text.
                self._write_text("=" * (best - level))
            self._write_all(title)
            self._write(tokens.HeadingEnd())
        finally:
            self._global ^= contexts.GL_HEADING
    def _handle_heading_end(self):
        """Handle the end of a section heading at the head of the string."""
        reset = self._head
        self._head += 1
        best = 1
        while self._read() == "=":
            best += 1
            self._head += 1
        # Recover the open heading's level from its context bit position.
        current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
        level = min(current, min(best, 6))
        try:  # Try to check for a heading closure after this one
            after, after_level = self._parse(self._context)
        except BadRoute:
            if level < best:
                self._write_text("=" * (best - level))
            self._head = reset + best - 1
            return self._pop(), level
        else:  # Found another closure
            self._write_text("=" * best)
            self._write_all(after)
            return self._pop(), after_level
  313. def _really_parse_entity(self):
  314. """Actually parse an HTML entity and ensure that it is valid."""
  315. self._write(tokens.HTMLEntityStart())
  316. self._head += 1
  317. this = self._read(strict=True)
  318. if this == "#":
  319. numeric = True
  320. self._write(tokens.HTMLEntityNumeric())
  321. self._head += 1
  322. this = self._read(strict=True)
  323. if this[0].lower() == "x":
  324. hexadecimal = True
  325. self._write(tokens.HTMLEntityHex(char=this[0]))
  326. this = this[1:]
  327. if not this:
  328. self._fail_route()
  329. else:
  330. hexadecimal = False
  331. else:
  332. numeric = hexadecimal = False
  333. valid = "0123456789abcdefABCDEF" if hexadecimal else "0123456789"
  334. if not numeric and not hexadecimal:
  335. valid += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
  336. if not all([char in valid for char in this]):
  337. self._fail_route()
  338. self._head += 1
  339. if self._read() != ";":
  340. self._fail_route()
  341. if numeric:
  342. test = int(this, 16) if hexadecimal else int(this)
  343. if test < 1 or test > 0x10FFFF:
  344. self._fail_route()
  345. else:
  346. if this not in htmlentities.entitydefs:
  347. self._fail_route()
  348. self._write(tokens.Text(text=this))
  349. self._write(tokens.HTMLEntityEnd())
  350. def _parse_entity(self):
  351. """Parse an HTML entity at the head of the wikicode string."""
  352. reset = self._head
  353. self._push()
  354. try:
  355. self._really_parse_entity()
  356. except BadRoute:
  357. self._head = reset
  358. self._write_text(self._read())
  359. else:
  360. self._write_all(self._pop())
    def _parse_comment(self):
        """Parse an HTML comment at the head of the wikicode string."""
        self._head += 4  # skip over "<!--"
        reset = self._head - 1
        try:
            comment = self._parse(contexts.COMMENT)
        except BadRoute:
            # Unterminated comment: rewind and emit "<!--" as literal text.
            self._head = reset
            self._write_text("<!--")
        else:
            self._write(tokens.CommentStart())
            self._write_all(comment)
            self._write(tokens.CommentEnd())
            self._head += 2  # skip the rest of "-->"
  375. def _parse_tag(self):
  376. """Parse an HTML tag at the head of the wikicode string."""
  377. reset = self._head
  378. self._head += 1
  379. try:
  380. tokens = self._really_parse_tag()
  381. except BadRoute:
  382. self._head = reset
  383. self._write_text("<")
  384. else:
  385. self._write_all(tokens)
    def _really_parse_tag(self):
        """Actually parse an HTML tag, starting with the open (``<foo>``)."""
        data = _TagOpenData()
        self._push(contexts.TAG_OPEN)
        self._write(tokens.TagOpenOpen(showtag=True))
        while True:
            this, next = self._read(), self._read(1)
            if this not in self.MARKERS:
                for chunk in self.tag_splitter.split(this):
                    if self._handle_tag_chunk(data, chunk):
                        # NOTE(review): ``continue`` only advances this inner
                        # loop; the handler's comment says it should *break*
                        # out of chunk processing early — confirm intent.
                        continue
            elif this is self.END:
                # Unterminated tag; fail (popping any open attribute first).
                if self._context & contexts.TAG_ATTR:
                    self._pop()
                self._fail_route()
            elif this == ">" and data.literal:
                if data.context & data.CX_ATTR:
                    self._push_tag_buffer(data)
                padding = data.padding_buffer[0] if data.padding_buffer else ""
                self._write(tokens.TagCloseOpen(padding=padding))
                self._context = contexts.TAG_BODY
                self._head += 1
                return self._parse(push=False)
            elif this == "/" and next == ">" and data.literal:
                # Self-closing tag, e.g. <br/>.
                if data.context & data.CX_ATTR:
                    self._push_tag_buffer(data)
                padding = data.padding_buffer[0] if data.padding_buffer else ""
                self._write(tokens.TagCloseSelfclose(padding=padding))
                self._head += 1
                return self._pop()
            else:
                for chunk in self.tag_splitter.split(this):
                    if self._handle_tag_chunk(data, chunk):
                        continue
            self._head += 1
    def _handle_tag_chunk(self, data, chunk):
        """Process one chunk of text from inside an HTML open tag.

        *data* is the current :py:class:`_TagOpenData`; *chunk* is a piece of
        the input produced by splitting on whitespace, quotes, and
        backslashes. Returns a truthy value to ask the caller to stop
        processing the remaining chunks early.
        """
        if not chunk:
            return
        if data.context & data.CX_NAME:
            if chunk != chunk.lstrip():  # Tags cannot start with whitespace
                self._fail_route()
            self._write_text(chunk)
            data.context = data.CX_NEED_SPACE
        elif data.context & data.CX_NEED_SPACE:
            if chunk.isspace():
                if data.context & data.CX_ATTR_VALUE:
                    self._push_tag_buffer(data)
                data.padding_buffer.append(chunk)
                data.context = data.CX_ATTR_READY
            else:
                if data.context & data.CX_ATTR_VALUE:
                    # Text follows a closed quote; re-parse the value as an
                    # unquoted one, ignoring the quote this time.
                    data.context ^= data.CX_NEED_SPACE
                    data.quote_buffer = []
                    data.ignore_quote = True
                    self._head = data.reset
                    return True  # Break out of chunk processing early
                else:
                    self._fail_route()
        elif data.context & data.CX_ATTR_READY:
            if chunk.isspace():
                data.padding_buffer.append(chunk)
            else:
                data.context = data.CX_ATTR_NAME
                self._push(contexts.TAG_ATTR)
                self._write_text(chunk)  ### hook on here for {, <, etc
        elif data.context & data.CX_ATTR_NAME:
            if chunk.isspace():
                data.padding_buffer.append(chunk)
                data.context |= data.CX_NEED_EQUALS
            elif chunk == "=":
                if not data.context & data.CX_NEED_EQUALS:
                    data.padding_buffer.append("")  # No padding before equals
                data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE
                self._write(tokens.TagAttrEquals())
            else:
                if data.context & data.CX_NEED_EQUALS:
                    # A bare attribute (no value); start the next one.
                    self._push_tag_buffer(data)
                    data.padding_buffer.append("")  # No padding before tag
                data.context = data.CX_ATTR_NAME
                self._push(contexts.TAG_ATTR)
                self._write_text(chunk)  ### hook on here for {, <, etc
        elif data.context & data.CX_ATTR_VALUE:
            ### handle backslashes here
            if data.context & data.CX_NEED_QUOTE:
                if chunk == '"' and not data.ignore_quote:
                    data.context ^= data.CX_NEED_QUOTE
                    data.literal = False
                    data.reset = self._head
                elif chunk.isspace():
                    data.padding_buffer.append(chunk)
                else:
                    # Unquoted value begins immediately.
                    data.context ^= data.CX_NEED_QUOTE
                    self._write_text(chunk)  ### hook on here for {, <, etc
            elif not data.literal:
                if chunk == '"':
                    data.context |= data.CX_NEED_SPACE
                    data.literal = True
                else:
                    data.quote_buffer.append(chunk)
            elif chunk.isspace():
                self._push_tag_buffer(data)
                data.padding_buffer.append(chunk)
                data.context = data.CX_ATTR_READY
            else:
                self._write_text(chunk)  ### hook on here for {, <, etc
    def _push_tag_buffer(self, data):
        """Flush the pending attribute in *data*'s buffers out as tokens."""
        buf = data.padding_buffer
        # Pad the buffer to exactly three entries (first/before-eq/after-eq).
        while len(buf) < 3:
            buf.append("")
        self._write_first(tokens.TagAttrStart(
            pad_after_eq=buf.pop(), pad_before_eq=buf.pop(),
            pad_first=buf.pop()))
        if data.quote_buffer:
            self._write(tokens.TagAttrQuote())
            self._write_text("".join(data.quote_buffer))
        self._write_all(self._pop())
        data.padding_buffer, data.quote_buffer = [], []
        data.ignore_quote = False
  504. def _get_tag_from_stack(self, stack=None):
  505. """Return the tag based on the text in *stack*."""
  506. if not stack:
  507. sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen)
  508. pred = lambda tok: not isinstance(tok, sentinels)
  509. stack = takewhile(pred, self._stack)
  510. text = [tok.text for tok in stack if isinstance(tok, tokens.Text)]
  511. try:
  512. return "".join(text).rstrip().lower().split()[0]
  513. except IndexError:
  514. self._fail_route()
    def _handle_tag_open_close(self):
        """Handle the opening of a closing tag (``</foo>``)."""
        self._write(tokens.TagOpenClose())
        self._push(contexts.TAG_CLOSE)
        self._head += 1

    def _handle_tag_close_close(self):
        """Handle the ending of a closing tag (``</foo>``)."""
        closing = self._pop()
        if self._get_tag_from_stack(closing) != self._get_tag_from_stack():
            # The closing tag's name doesn't match the opening tag's.
            self._fail_route()
        self._write_all(closing)
        self._write(tokens.TagCloseClose())
        return self._pop()
    def _verify_safe(self, this):
        """Make sure we are not trying to write an invalid character.

        Returns ``False`` when writing *this* would invalidate the current
        route; may also set ``FAIL_*`` flags on the context to defer the
        failure to a later character.
        """
        context = self._context
        if context & contexts.FAIL_NEXT:
            return False
        if context & contexts.WIKILINK_TITLE:
            if this == "]" or this == "{":
                self._context |= contexts.FAIL_NEXT
            elif this == "\n" or this == "[" or this == "}":
                return False
            return True
        elif context & contexts.TEMPLATE_NAME:
            if this == "{" or this == "}" or this == "[":
                self._context |= contexts.FAIL_NEXT
                return True
            if this == "]":
                return False
            if this == "|":
                return True
            if context & contexts.HAS_TEXT:
                if context & contexts.FAIL_ON_TEXT:
                    if this is self.END or not this.isspace():
                        return False
                else:
                    if this == "\n":
                        self._context |= contexts.FAIL_ON_TEXT
            elif this is self.END or not this.isspace():
                self._context |= contexts.HAS_TEXT
            return True
        elif context & contexts.TAG_CLOSE:
            return this != "<" and this != "\n"
        else:
            if context & contexts.FAIL_ON_EQUALS:
                if this == "=":
                    return False
            elif context & contexts.FAIL_ON_LBRACE:
                if this == "{" or (self._read(-1) == self._read(-2) == "{"):
                    if context & contexts.TEMPLATE:
                        self._context |= contexts.FAIL_ON_EQUALS
                    else:
                        self._context |= contexts.FAIL_NEXT
                    return True
                self._context ^= contexts.FAIL_ON_LBRACE
            elif context & contexts.FAIL_ON_RBRACE:
                if this == "}":
                    if context & contexts.TEMPLATE:
                        self._context |= contexts.FAIL_ON_EQUALS
                    else:
                        self._context |= contexts.FAIL_NEXT
                    return True
                self._context ^= contexts.FAIL_ON_RBRACE
            elif this == "{":
                self._context |= contexts.FAIL_ON_LBRACE
            elif this == "}":
                self._context |= contexts.FAIL_ON_RBRACE
            return True
    def _parse(self, context=0, push=True):
        """Parse the wikicode string, using *context* for when to stop.

        This is the main dispatch loop: each marker character at the head is
        routed to the appropriate ``_parse_*``/``_handle_*`` method, and plain
        text is buffered.
        """
        # Contexts whose characters must be validated by _verify_safe().
        unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
                  contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME |
                  contexts.TAG_CLOSE)
        # Contexts that fail (rather than return) when the input ends.
        fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
                contexts.HEADING | contexts.COMMENT | contexts.TAG)
        # Contexts with an extra stack that must be popped before failing.
        double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
        if push:
            self._push(context)
        while True:
            this = self._read()
            if self._context & unsafe:
                if not self._verify_safe(this):
                    if self._context & double_fail:
                        self._pop()
                    self._fail_route()
            if this not in self.MARKERS:
                self._write_text(this)
                self._head += 1
                continue
            if this is self.END:
                if self._context & fail:
                    if self._context & double_fail:
                        self._pop()
                    self._fail_route()
                return self._pop()
            next = self._read(1)
            if self._context & contexts.COMMENT:
                if this == next == "-" and self._read(2) == ">":
                    return self._pop()
                else:
                    self._write_text(this)
            elif this == next == "{":
                if self._can_recurse():
                    self._parse_template_or_argument()
                    if self._context & contexts.FAIL_NEXT:
                        self._context ^= contexts.FAIL_NEXT
                else:
                    self._write_text("{")
            elif this == "|" and self._context & contexts.TEMPLATE:
                self._handle_template_param()
            elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
                self._handle_template_param_value()
            elif this == next == "}" and self._context & contexts.TEMPLATE:
                return self._handle_template_end()
            elif this == "|" and self._context & contexts.ARGUMENT_NAME:
                self._handle_argument_separator()
            elif this == next == "}" and self._context & contexts.ARGUMENT:
                if self._read(2) == "}":
                    return self._handle_argument_end()
                else:
                    self._write_text("}")
            elif this == next == "[":
                if not self._context & contexts.WIKILINK_TITLE and self._can_recurse():
                    self._parse_wikilink()
                    if self._context & contexts.FAIL_NEXT:
                        self._context ^= contexts.FAIL_NEXT
                else:
                    self._write_text("[")
            elif this == "|" and self._context & contexts.WIKILINK_TITLE:
                self._handle_wikilink_separator()
            elif this == next == "]" and self._context & contexts.WIKILINK:
                return self._handle_wikilink_end()
            elif this == "=" and not self._global & contexts.GL_HEADING:
                # Headings only start at the beginning of a line.
                if self._read(-1) in ("\n", self.START):
                    self._parse_heading()
                else:
                    self._write_text("=")
            elif this == "=" and self._context & contexts.HEADING:
                return self._handle_heading_end()
            elif this == "\n" and self._context & contexts.HEADING:
                self._fail_route()
            elif this == "&":
                self._parse_entity()
            elif this == "<" and next == "!":
                if self._read(2) == self._read(3) == "-":
                    self._parse_comment()
                else:
                    self._write_text(this)
            elif this == "<" and next != "/" and not self._context & contexts.TAG_CLOSE:
                self._parse_tag()
            elif this == "<" and next == "/" and self._context & contexts.TAG_BODY:
                self._handle_tag_open_close()
            elif this == ">" and self._context & contexts.TAG_CLOSE:
                return self._handle_tag_close_close()
            else:
                self._write_text(this)
            self._head += 1
  673. def tokenize(self, text):
  674. """Build a list of tokens from a string of wikicode and return it."""
  675. split = self.regex.split(text)
  676. self._text = [segment for segment in split if segment]
  677. return self._parse()