A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

767 lines
29 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from itertools import takewhile
  24. from math import log
  25. import re
  26. from . import contexts
  27. from . import tokens
  28. from ..compat import htmlentities
  29. from ..tag_defs import is_parsable
  30. __all__ = ["Tokenizer"]
  31. class BadRoute(Exception):
  32. """Raised internally when the current tokenization route is invalid."""
  33. pass
  34. class Tokenizer(object):
  35. """Creates a list of tokens from a string of wikicode."""
  36. USES_C = False
  37. START = object()
  38. END = object()
  39. MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
  40. "/", "-", "!", "\n", END]
  41. MAX_DEPTH = 40
  42. MAX_CYCLES = 100000
  43. regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)
  44. def __init__(self):
  45. self._text = None
  46. self._head = 0
  47. self._stacks = []
  48. self._global = 0
  49. self._depth = 0
  50. self._cycles = 0
  51. @property
  52. def _stack(self):
  53. """The current token stack."""
  54. return self._stacks[-1][0]
  55. @property
  56. def _context(self):
  57. """The current token context."""
  58. return self._stacks[-1][1]
  59. @_context.setter
  60. def _context(self, value):
  61. self._stacks[-1][1] = value
  62. @property
  63. def _textbuffer(self):
  64. """The current textbuffer."""
  65. return self._stacks[-1][2]
  66. @_textbuffer.setter
  67. def _textbuffer(self, value):
  68. self._stacks[-1][2] = value
  69. def _push(self, context=0):
  70. """Add a new token stack, context, and textbuffer to the list."""
  71. self._stacks.append([[], context, []])
  72. self._depth += 1
  73. self._cycles += 1
  74. def _push_textbuffer(self):
  75. """Push the textbuffer onto the stack as a Text node and clear it."""
  76. if self._textbuffer:
  77. self._stack.append(tokens.Text(text="".join(self._textbuffer)))
  78. self._textbuffer = []
  79. def _pop(self, keep_context=False):
  80. """Pop the current stack/context/textbuffer, returing the stack.
  81. If *keep_context* is ``True``, then we will replace the underlying
  82. stack's context with the current stack's.
  83. """
  84. self._push_textbuffer()
  85. self._depth -= 1
  86. if keep_context:
  87. context = self._context
  88. stack = self._stacks.pop()[0]
  89. self._context = context
  90. return stack
  91. return self._stacks.pop()[0]
  92. def _can_recurse(self):
  93. """Return whether or not our max recursion depth has been exceeded."""
  94. return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES
  95. def _fail_route(self):
  96. """Fail the current tokenization route.
  97. Discards the current stack/context/textbuffer and raises
  98. :py:exc:`~.BadRoute`.
  99. """
  100. self._pop()
  101. raise BadRoute()
  102. def _write(self, token):
  103. """Write a token to the end of the current token stack."""
  104. self._push_textbuffer()
  105. self._stack.append(token)
  106. def _write_first(self, token):
  107. """Write a token to the beginning of the current token stack."""
  108. self._push_textbuffer()
  109. self._stack.insert(0, token)
  110. def _write_text(self, text):
  111. """Write text to the current textbuffer."""
  112. self._textbuffer.append(text)
  113. def _write_all(self, tokenlist):
  114. """Write a series of tokens to the current stack at once."""
  115. if tokenlist and isinstance(tokenlist[0], tokens.Text):
  116. self._write_text(tokenlist.pop(0).text)
  117. self._push_textbuffer()
  118. self._stack.extend(tokenlist)
  119. def _write_text_then_stack(self, text):
  120. """Pop the current stack, write *text*, and then write the stack."""
  121. stack = self._pop()
  122. self._write_text(text)
  123. if stack:
  124. self._write_all(stack)
  125. self._head -= 1
  126. def _read(self, delta=0, wrap=False, strict=False):
  127. """Read the value at a relative point in the wikicode.
  128. The value is read from :py:attr:`self._head <_head>` plus the value of
  129. *delta* (which can be negative). If *wrap* is ``False``, we will not
  130. allow attempts to read from the end of the string if ``self._head +
  131. delta`` is negative. If *strict* is ``True``, the route will be failed
  132. (with :py:meth:`_fail_route`) if we try to read from past the end of
  133. the string; otherwise, :py:attr:`self.END <END>` is returned. If we try
  134. to read from before the start of the string, :py:attr:`self.START
  135. <START>` is returned.
  136. """
  137. index = self._head + delta
  138. if index < 0 and (not wrap or abs(index) > len(self._text)):
  139. return self.START
  140. try:
  141. return self._text[index]
  142. except IndexError:
  143. if strict:
  144. self._fail_route()
  145. return self.END
  146. def _parse_template_or_argument(self):
  147. """Parse a template or argument at the head of the wikicode string."""
  148. self._head += 2
  149. braces = 2
  150. while self._read() == "{":
  151. self._head += 1
  152. braces += 1
  153. self._push()
  154. while braces:
  155. if braces == 1:
  156. return self._write_text_then_stack("{")
  157. if braces == 2:
  158. try:
  159. self._parse_template()
  160. except BadRoute:
  161. return self._write_text_then_stack("{{")
  162. break
  163. try:
  164. self._parse_argument()
  165. braces -= 3
  166. except BadRoute:
  167. try:
  168. self._parse_template()
  169. braces -= 2
  170. except BadRoute:
  171. return self._write_text_then_stack("{" * braces)
  172. if braces:
  173. self._head += 1
  174. self._write_all(self._pop())
  175. def _parse_template(self):
  176. """Parse a template at the head of the wikicode string."""
  177. reset = self._head
  178. try:
  179. template = self._parse(contexts.TEMPLATE_NAME)
  180. except BadRoute:
  181. self._head = reset
  182. raise
  183. self._write_first(tokens.TemplateOpen())
  184. self._write_all(template)
  185. self._write(tokens.TemplateClose())
  186. def _parse_argument(self):
  187. """Parse an argument at the head of the wikicode string."""
  188. reset = self._head
  189. try:
  190. argument = self._parse(contexts.ARGUMENT_NAME)
  191. except BadRoute:
  192. self._head = reset
  193. raise
  194. self._write_first(tokens.ArgumentOpen())
  195. self._write_all(argument)
  196. self._write(tokens.ArgumentClose())
  197. def _handle_template_param(self):
  198. """Handle a template parameter at the head of the string."""
  199. if self._context & contexts.TEMPLATE_NAME:
  200. self._context ^= contexts.TEMPLATE_NAME
  201. elif self._context & contexts.TEMPLATE_PARAM_VALUE:
  202. self._context ^= contexts.TEMPLATE_PARAM_VALUE
  203. elif self._context & contexts.TEMPLATE_PARAM_KEY:
  204. self._write_all(self._pop(keep_context=True))
  205. self._context |= contexts.TEMPLATE_PARAM_KEY
  206. self._write(tokens.TemplateParamSeparator())
  207. self._push(self._context)
  208. def _handle_template_param_value(self):
  209. """Handle a template parameter's value at the head of the string."""
  210. self._write_all(self._pop(keep_context=True))
  211. self._context ^= contexts.TEMPLATE_PARAM_KEY
  212. self._context |= contexts.TEMPLATE_PARAM_VALUE
  213. self._write(tokens.TemplateParamEquals())
  214. def _handle_template_end(self):
  215. """Handle the end of a template at the head of the string."""
  216. if self._context & contexts.TEMPLATE_PARAM_KEY:
  217. self._write_all(self._pop(keep_context=True))
  218. self._head += 1
  219. return self._pop()
  220. def _handle_argument_separator(self):
  221. """Handle the separator between an argument's name and default."""
  222. self._context ^= contexts.ARGUMENT_NAME
  223. self._context |= contexts.ARGUMENT_DEFAULT
  224. self._write(tokens.ArgumentSeparator())
  225. def _handle_argument_end(self):
  226. """Handle the end of an argument at the head of the string."""
  227. self._head += 2
  228. return self._pop()
  229. def _parse_wikilink(self):
  230. """Parse an internal wikilink at the head of the wikicode string."""
  231. self._head += 2
  232. reset = self._head - 1
  233. try:
  234. wikilink = self._parse(contexts.WIKILINK_TITLE)
  235. except BadRoute:
  236. self._head = reset
  237. self._write_text("[[")
  238. else:
  239. self._write(tokens.WikilinkOpen())
  240. self._write_all(wikilink)
  241. self._write(tokens.WikilinkClose())
  242. def _handle_wikilink_separator(self):
  243. """Handle the separator between a wikilink's title and its text."""
  244. self._context ^= contexts.WIKILINK_TITLE
  245. self._context |= contexts.WIKILINK_TEXT
  246. self._write(tokens.WikilinkSeparator())
  247. def _handle_wikilink_end(self):
  248. """Handle the end of a wikilink at the head of the string."""
  249. self._head += 1
  250. return self._pop()
  251. def _parse_heading(self):
  252. """Parse a section heading at the head of the wikicode string."""
  253. self._global |= contexts.GL_HEADING
  254. reset = self._head
  255. self._head += 1
  256. best = 1
  257. while self._read() == "=":
  258. best += 1
  259. self._head += 1
  260. context = contexts.HEADING_LEVEL_1 << min(best - 1, 5)
  261. try:
  262. title, level = self._parse(context)
  263. except BadRoute:
  264. self._head = reset + best - 1
  265. self._write_text("=" * best)
  266. else:
  267. self._write(tokens.HeadingStart(level=level))
  268. if level < best:
  269. self._write_text("=" * (best - level))
  270. self._write_all(title)
  271. self._write(tokens.HeadingEnd())
  272. finally:
  273. self._global ^= contexts.GL_HEADING
  274. def _handle_heading_end(self):
  275. """Handle the end of a section heading at the head of the string."""
  276. reset = self._head
  277. self._head += 1
  278. best = 1
  279. while self._read() == "=":
  280. best += 1
  281. self._head += 1
  282. current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
  283. level = min(current, min(best, 6))
  284. try: # Try to check for a heading closure after this one
  285. after, after_level = self._parse(self._context)
  286. except BadRoute:
  287. if level < best:
  288. self._write_text("=" * (best - level))
  289. self._head = reset + best - 1
  290. return self._pop(), level
  291. else: # Found another closure
  292. self._write_text("=" * best)
  293. self._write_all(after)
  294. return self._pop(), after_level
  295. def _really_parse_entity(self):
  296. """Actually parse an HTML entity and ensure that it is valid."""
  297. self._write(tokens.HTMLEntityStart())
  298. self._head += 1
  299. this = self._read(strict=True)
  300. if this == "#":
  301. numeric = True
  302. self._write(tokens.HTMLEntityNumeric())
  303. self._head += 1
  304. this = self._read(strict=True)
  305. if this[0].lower() == "x":
  306. hexadecimal = True
  307. self._write(tokens.HTMLEntityHex(char=this[0]))
  308. this = this[1:]
  309. if not this:
  310. self._fail_route()
  311. else:
  312. hexadecimal = False
  313. else:
  314. numeric = hexadecimal = False
  315. valid = "0123456789abcdefABCDEF" if hexadecimal else "0123456789"
  316. if not numeric and not hexadecimal:
  317. valid += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
  318. if not all([char in valid for char in this]):
  319. self._fail_route()
  320. self._head += 1
  321. if self._read() != ";":
  322. self._fail_route()
  323. if numeric:
  324. test = int(this, 16) if hexadecimal else int(this)
  325. if test < 1 or test > 0x10FFFF:
  326. self._fail_route()
  327. else:
  328. if this not in htmlentities.entitydefs:
  329. self._fail_route()
  330. self._write(tokens.Text(text=this))
  331. self._write(tokens.HTMLEntityEnd())
  332. def _parse_entity(self):
  333. """Parse an HTML entity at the head of the wikicode string."""
  334. reset = self._head
  335. self._push()
  336. try:
  337. self._really_parse_entity()
  338. except BadRoute:
  339. self._head = reset
  340. self._write_text(self._read())
  341. else:
  342. self._write_all(self._pop())
  343. def _parse_comment(self):
  344. """Parse an HTML comment at the head of the wikicode string."""
  345. self._head += 4
  346. reset = self._head - 1
  347. try:
  348. comment = self._parse(contexts.COMMENT)
  349. except BadRoute:
  350. self._head = reset
  351. self._write_text("<!--")
  352. else:
  353. self._write(tokens.CommentStart())
  354. self._write_all(comment)
  355. self._write(tokens.CommentEnd())
  356. self._head += 2
  357. def _parse_tag(self):
  358. """Parse an HTML tag at the head of the wikicode string."""
  359. reset = self._head
  360. self._head += 1
  361. try:
  362. tokens = self._parse(contexts.TAG_OPEN_NAME)
  363. except BadRoute:
  364. self._head = reset
  365. self._write_text("<")
  366. else:
  367. self._write_all(tokens)
  368. def _actually_close_tag_opening(self):
  369. """Handle cleanup at the end of a opening tag.
  370. The current context will be updated and the
  371. :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the
  372. opening tag's padding to be used in the
  373. :py:class:`~.tokens.TagOpenClose` token.
  374. """
  375. if self._context & contexts.TAG_OPEN_ATTR:
  376. if self._context & contexts.TAG_OPEN_ATTR_NAME:
  377. self._context ^= contexts.TAG_OPEN_ATTR_NAME
  378. if self._context & contexts.TAG_OPEN_ATTR_BODY:
  379. self._context ^= contexts.TAG_OPEN_ATTR_BODY
  380. else:
  381. self._write_first(tokens.TagOpenOpen(showtag=True))
  382. self._context ^= contexts.TAG_OPEN_NAME
  383. self._context |= contexts.TAG_BODY
  384. self._push_textbuffer()
  385. if isinstance(self._stack[-1], tokens.TagAttrStart):
  386. return self._stack.pop().padding
  387. return ""
  388. def _actually_handle_chunk(self, chunks, is_new):
  389. """Actually handle a chunk of code within a tag's attributes.
  390. Called by :py:meth:`_handle_tag_chunk` and
  391. :py:meth:`_handle_tag_attribute_body`.
  392. """
  393. if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  394. padding = 0
  395. while chunks:
  396. if chunks[0] == "":
  397. padding += 1
  398. chunks.pop(0)
  399. else:
  400. break
  401. self._write(tokens.TagAttrStart(padding=" " * padding))
  402. elif self._context & contexts.TAG_OPEN_ATTR_IGNORE:
  403. self._context ^= contexts.TAG_OPEN_ATTR_IGNORE
  404. chunks.pop(0)
  405. return
  406. elif is_new and self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  407. self._write_text(" ") # Quoted chunks don't lose their spaces
  408. if chunks:
  409. chunk = chunks.pop(0)
  410. if self._context & contexts.TAG_OPEN_ATTR_BODY:
  411. self._context ^= contexts.TAG_OPEN_ATTR_BODY
  412. self._context |= contexts.TAG_OPEN_ATTR_NAME
  413. if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  414. if re.search(r'[^\\]"', chunk[:-1]):
  415. self._fail_route()
  416. if re.search(r'[^\\]"$', chunk):
  417. self._write_text(chunk[:-1])
  418. self._context ^= contexts.TAG_OPEN_ATTR_QUOTED
  419. self._context |= contexts.TAG_OPEN_ATTR_NAME
  420. return True # Back to _handle_tag_attribute_body()
  421. self._write_text(chunk)
  422. def _handle_tag_chunk(self, text):
  423. """Handle a chunk of code within a tag's attributes.
  424. This is called by :py:meth:`_parse`, which intercepts parsing of
  425. wikicode when we're inside of an opening tag and no :py:attr:`MARKERS`
  426. are present.
  427. """
  428. if " " not in text and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  429. self._write_text(text)
  430. return
  431. chunks = text.split(" ")
  432. is_new = False
  433. is_quoted = False
  434. if self._context & contexts.TAG_OPEN_NAME:
  435. self._write_text(chunks.pop(0))
  436. self._write_first(tokens.TagOpenOpen(showtag=True))
  437. self._context ^= contexts.TAG_OPEN_NAME
  438. self._context |= contexts.TAG_OPEN_ATTR_NAME
  439. self._actually_handle_chunk(chunks, True)
  440. is_new = True
  441. while chunks:
  442. result = self._actually_handle_chunk(chunks, is_new)
  443. is_quoted = result or is_quoted
  444. is_new = True
  445. if is_quoted:
  446. return self._pop()
  447. def _handle_tag_attribute_body(self):
  448. """Handle the body, or value, of a tag attribute.
  449. Attribute bodies can usually be handled at once, but sometimes a new
  450. stack must be created to keep track of "rich" attribute values that
  451. contain, for example, templates.
  452. """
  453. self._context ^= contexts.TAG_OPEN_ATTR_NAME
  454. self._context |= contexts.TAG_OPEN_ATTR_BODY
  455. self._write(tokens.TagAttrEquals())
  456. next = self._read(1)
  457. if next not in self.MARKERS and next.startswith('"'):
  458. chunks = None
  459. if " " in next:
  460. chunks = next.split(" ")
  461. next = chunks.pop(0)
  462. if re.search(r'[^\\]"$', next[1:]):
  463. if not re.search(r'[^\\]"', next[1:-1]):
  464. self._write(tokens.TagAttrQuote())
  465. self._write_text(next[1:-1])
  466. self._head += 1
  467. else:
  468. if not re.search(r'[^\\]"', next[1:]):
  469. self._head += 1
  470. reset = self._head
  471. try:
  472. attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED |
  473. contexts.TAG_OPEN_ATTR_IGNORE)
  474. except BadRoute:
  475. self._head = reset
  476. self._write_text(next)
  477. else:
  478. self._write(tokens.TagAttrQuote())
  479. self._write_text(next[1:])
  480. self._write_all(attr)
  481. return
  482. self._context ^= contexts.TAG_OPEN_ATTR_BODY
  483. self._context |= contexts.TAG_OPEN_ATTR_NAME
  484. while chunks:
  485. self._actually_handle_chunk(chunks, True)
  486. def _get_tag_from_stack(self, stack=None):
  487. """Return the tag based on the text in *stack*."""
  488. if not stack:
  489. sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen)
  490. func = lambda tok: not isinstance(tok, sentinels)
  491. stack = takewhile(func, self._stack)
  492. text = [tok.text for tok in stack if isinstance(tok, tokens.Text)]
  493. return "".join(text).rstrip().lower()
  494. def _handle_tag_close_open(self):
  495. """Handle the ending of an open tag (``<foo>``)."""
  496. padding = self._actually_close_tag_opening()
  497. if not self._get_tag_from_stack(): # Tags cannot be blank
  498. self._fail_route()
  499. self._write(tokens.TagCloseOpen(padding=padding))
  500. def _handle_tag_selfclose(self):
  501. """Handle the ending of an tag that closes itself (``<foo />``)."""
  502. padding = self._actually_close_tag_opening()
  503. if not self._get_tag_from_stack(): # Tags cannot be blank
  504. self._fail_route()
  505. self._write(tokens.TagCloseSelfclose(padding=padding))
  506. self._head += 1
  507. return self._pop()
  508. def _handle_tag_open_close(self):
  509. """Handle the opening of a closing tag (``</foo>``)."""
  510. self._write(tokens.TagOpenClose())
  511. self._push(contexts.TAG_CLOSE)
  512. self._head += 1
  513. def _handle_tag_close_close(self):
  514. """Handle the ending of a closing tag (``</foo>``)."""
  515. closing = self._pop()
  516. close_tag = self._get_tag_from_stack(closing)
  517. open_tag = self._get_tag_from_stack()
  518. if not close_tag or close_tag != open_tag:
  519. # Closing and opening tags are empty or unequal, so fail this tag:
  520. self._fail_route()
  521. self._write_all(closing)
  522. self._write(tokens.TagCloseClose())
  523. return self._pop()
  524. def _verify_safe(self, this):
  525. """Make sure we are not trying to write an invalid character."""
  526. context = self._context
  527. if context & contexts.FAIL_NEXT:
  528. return False
  529. if context & contexts.WIKILINK_TITLE:
  530. if this == "]" or this == "{":
  531. self._context |= contexts.FAIL_NEXT
  532. elif this == "\n" or this == "[" or this == "}":
  533. return False
  534. return True
  535. elif context & contexts.TEMPLATE_NAME:
  536. if this == "{" or this == "}" or this == "[":
  537. self._context |= contexts.FAIL_NEXT
  538. return True
  539. if this == "]":
  540. return False
  541. if this == "|":
  542. return True
  543. if context & contexts.HAS_TEXT:
  544. if context & contexts.FAIL_ON_TEXT:
  545. if this is self.END or not this.isspace():
  546. return False
  547. else:
  548. if this == "\n":
  549. self._context |= contexts.FAIL_ON_TEXT
  550. elif this is self.END or not this.isspace():
  551. self._context |= contexts.HAS_TEXT
  552. return True
  553. elif context & contexts.TAG_CLOSE:
  554. return this != "<" and this != "\n"
  555. else:
  556. if context & contexts.FAIL_ON_EQUALS:
  557. if this == "=":
  558. return False
  559. elif context & contexts.FAIL_ON_LBRACE:
  560. if this == "{" or (self._read(-1) == self._read(-2) == "{"):
  561. if context & contexts.TEMPLATE:
  562. self._context |= contexts.FAIL_ON_EQUALS
  563. else:
  564. self._context |= contexts.FAIL_NEXT
  565. return True
  566. self._context ^= contexts.FAIL_ON_LBRACE
  567. elif context & contexts.FAIL_ON_RBRACE:
  568. if this == "}":
  569. if context & contexts.TEMPLATE:
  570. self._context |= contexts.FAIL_ON_EQUALS
  571. else:
  572. self._context |= contexts.FAIL_NEXT
  573. return True
  574. self._context ^= contexts.FAIL_ON_RBRACE
  575. elif this == "{":
  576. self._context |= contexts.FAIL_ON_LBRACE
  577. elif this == "}":
  578. self._context |= contexts.FAIL_ON_RBRACE
  579. return True
  580. def _parse(self, context=0):
  581. """Parse the wikicode string, using *context* for when to stop."""
  582. self._push(context)
  583. while True:
  584. this = self._read()
  585. unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
  586. contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME |
  587. contexts.TAG_CLOSE)
  588. if self._context & unsafe:
  589. if not self._verify_safe(this):
  590. double = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
  591. if self._context & double:
  592. self._pop()
  593. self._fail_route()
  594. if this not in self.MARKERS:
  595. if self._context & contexts.TAG_OPEN:
  596. should_exit = self._handle_tag_chunk(this)
  597. if should_exit:
  598. return should_exit
  599. else:
  600. self._write_text(this)
  601. self._head += 1
  602. continue
  603. if this is self.END:
  604. fail = (
  605. contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
  606. contexts.HEADING | contexts.COMMENT | contexts.TAG)
  607. if self._context & fail:
  608. double_fail = (
  609. contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE |
  610. contexts.TAG_OPEN_ATTR_QUOTED)
  611. if self._context & double_fail:
  612. self._pop()
  613. self._fail_route()
  614. return self._pop()
  615. next = self._read(1)
  616. if self._context & contexts.COMMENT:
  617. if this == next == "-" and self._read(2) == ">":
  618. return self._pop()
  619. else:
  620. self._write_text(this)
  621. elif this == next == "{":
  622. if self._can_recurse():
  623. self._parse_template_or_argument()
  624. if self._context & contexts.FAIL_NEXT:
  625. self._context ^= contexts.FAIL_NEXT
  626. else:
  627. self._write_text("{")
  628. elif this == "|" and self._context & contexts.TEMPLATE:
  629. self._handle_template_param()
  630. elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
  631. self._handle_template_param_value()
  632. elif this == next == "}" and self._context & contexts.TEMPLATE:
  633. return self._handle_template_end()
  634. elif this == "|" and self._context & contexts.ARGUMENT_NAME:
  635. self._handle_argument_separator()
  636. elif this == next == "}" and self._context & contexts.ARGUMENT:
  637. if self._read(2) == "}":
  638. return self._handle_argument_end()
  639. else:
  640. self._write_text("}")
  641. elif this == next == "[":
  642. if not self._context & contexts.WIKILINK_TITLE and self._can_recurse():
  643. self._parse_wikilink()
  644. if self._context & contexts.FAIL_NEXT:
  645. self._context ^= contexts.FAIL_NEXT
  646. else:
  647. self._write_text("[")
  648. elif this == "|" and self._context & contexts.WIKILINK_TITLE:
  649. self._handle_wikilink_separator()
  650. elif this == next == "]" and self._context & contexts.WIKILINK:
  651. return self._handle_wikilink_end()
  652. elif this == "=" and not self._global & contexts.GL_HEADING:
  653. if self._read(-1) in ("\n", self.START):
  654. self._parse_heading()
  655. elif self._context & contexts.TAG_OPEN_ATTR_NAME:
  656. self._handle_tag_attribute_body()
  657. else:
  658. self._write_text("=")
  659. elif this == "=" and self._context & contexts.HEADING:
  660. return self._handle_heading_end()
  661. elif this == "\n" and self._context & contexts.HEADING:
  662. self._fail_route()
  663. elif this == "&":
  664. self._parse_entity()
  665. elif this == "<" and next == "!":
  666. if self._read(2) == self._read(3) == "-":
  667. self._parse_comment()
  668. else:
  669. self._write_text(this)
  670. elif this == "<" and next != "/" and (
  671. not self._context & (contexts.TAG ^ contexts.TAG_BODY)):
  672. self._parse_tag()
  673. elif self._context & contexts.TAG_OPEN:
  674. if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  675. self._handle_tag_chunk(this)
  676. elif this == "\n":
  677. self._fail_route()
  678. elif this == ">":
  679. self._handle_tag_close_open()
  680. elif this == "/" and next == ">":
  681. return self._handle_tag_selfclose()
  682. elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME:
  683. self._handle_tag_attribute_body()
  684. else:
  685. self._handle_tag_chunk(this)
  686. elif this == "<" and next == "/" and self._context & contexts.TAG_BODY:
  687. self._handle_tag_open_close()
  688. elif this == ">" and self._context & contexts.TAG_CLOSE:
  689. return self._handle_tag_close_close()
  690. else:
  691. self._write_text(this)
  692. self._head += 1
  693. def tokenize(self, text):
  694. """Build a list of tokens from a string of wikicode and return it."""
  695. split = self.regex.split(text)
  696. self._text = [segment for segment in split if segment]
  697. return self._parse()