A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

764 rivejä
29 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from math import log
  24. import re
  25. from . import contexts
  26. from . import tokens
  27. from ..compat import htmlentities
  28. from ..nodes.tag import Tag
  29. __all__ = ["Tokenizer"]
  30. class BadRoute(Exception):
  31. """Raised internally when the current tokenization route is invalid."""
  32. pass
class Tokenizer(object):
    """Creates a list of tokens from a string of wikicode."""
    # Not read anywhere in this file; presumably tells callers that this is
    # the pure-Python tokenizer rather than a C one -- TODO confirm.
    USES_C = False
    # Sentinels returned by _read() for positions before the first segment
    # and past the last segment of the text, respectively.
    START = object()
    END = object()
    # Segments that _parse() dispatches on instead of writing as plain text.
    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
               "/", "-", "!", "\n", END]
    # Limits checked by _can_recurse(): open stack frames and total pushes.
    MAX_DEPTH = 40
    MAX_CYCLES = 100000
    # Used by tokenize() to split the input; the capturing group makes
    # re.split() keep every marker character as a segment of its own.
    regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)
  43. def __init__(self):
  44. self._text = None
  45. self._head = 0
  46. self._stacks = []
  47. self._global = 0
  48. self._depth = 0
  49. self._cycles = 0
  50. @property
  51. def _stack(self):
  52. """The current token stack."""
  53. return self._stacks[-1][0]
  54. @property
  55. def _context(self):
  56. """The current token context."""
  57. return self._stacks[-1][1]
  58. @_context.setter
  59. def _context(self, value):
  60. self._stacks[-1][1] = value
  61. @property
  62. def _textbuffer(self):
  63. """The current textbuffer."""
  64. return self._stacks[-1][2]
  65. @_textbuffer.setter
  66. def _textbuffer(self, value):
  67. self._stacks[-1][2] = value
  68. def _push(self, context=0):
  69. """Add a new token stack, context, and textbuffer to the list."""
  70. self._stacks.append([[], context, []])
  71. self._depth += 1
  72. self._cycles += 1
  73. def _push_textbuffer(self):
  74. """Push the textbuffer onto the stack as a Text node and clear it."""
  75. if self._textbuffer:
  76. self._stack.append(tokens.Text(text="".join(self._textbuffer)))
  77. self._textbuffer = []
  78. def _pop(self, keep_context=False):
  79. """Pop the current stack/context/textbuffer, returing the stack.
  80. If *keep_context* is ``True``, then we will replace the underlying
  81. stack's context with the current stack's.
  82. """
  83. self._push_textbuffer()
  84. self._depth -= 1
  85. if keep_context:
  86. context = self._context
  87. stack = self._stacks.pop()[0]
  88. self._context = context
  89. return stack
  90. return self._stacks.pop()[0]
  91. def _can_recurse(self):
  92. """Return whether or not our max recursion depth has been exceeded."""
  93. return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES
  94. def _fail_route(self):
  95. """Fail the current tokenization route.
  96. Discards the current stack/context/textbuffer and raises
  97. :py:exc:`~.BadRoute`.
  98. """
  99. self._pop()
  100. raise BadRoute()
  101. def _write(self, token):
  102. """Write a token to the end of the current token stack."""
  103. self._push_textbuffer()
  104. self._stack.append(token)
  105. def _write_first(self, token):
  106. """Write a token to the beginning of the current token stack."""
  107. self._push_textbuffer()
  108. self._stack.insert(0, token)
  109. def _write_text(self, text):
  110. """Write text to the current textbuffer."""
  111. self._textbuffer.append(text)
  112. def _write_all(self, tokenlist):
  113. """Write a series of tokens to the current stack at once."""
  114. if tokenlist and isinstance(tokenlist[0], tokens.Text):
  115. self._write_text(tokenlist.pop(0).text)
  116. self._push_textbuffer()
  117. self._stack.extend(tokenlist)
  118. def _write_text_then_stack(self, text):
  119. """Pop the current stack, write *text*, and then write the stack."""
  120. stack = self._pop()
  121. self._write_text(text)
  122. if stack:
  123. self._write_all(stack)
  124. self._head -= 1
  125. def _read(self, delta=0, wrap=False, strict=False):
  126. """Read the value at a relative point in the wikicode.
  127. The value is read from :py:attr:`self._head <_head>` plus the value of
  128. *delta* (which can be negative). If *wrap* is ``False``, we will not
  129. allow attempts to read from the end of the string if ``self._head +
  130. delta`` is negative. If *strict* is ``True``, the route will be failed
  131. (with :py:meth:`_fail_route`) if we try to read from past the end of
  132. the string; otherwise, :py:attr:`self.END <END>` is returned. If we try
  133. to read from before the start of the string, :py:attr:`self.START
  134. <START>` is returned.
  135. """
  136. index = self._head + delta
  137. if index < 0 and (not wrap or abs(index) > len(self._text)):
  138. return self.START
  139. try:
  140. return self._text[index]
  141. except IndexError:
  142. if strict:
  143. self._fail_route()
  144. return self.END
    def _parse_template_or_argument(self):
        """Parse a template or argument at the head of the wikicode string.

        The run of opening braces at the head is counted, then used up three
        at a time (argument) or two at a time (template). Each successful
        parse wraps what has been parsed so far, because the opening token is
        written to the front of the working stack. Braces that cannot be
        matched are written back as plain text.
        """
        self._head += 2  # Step over the "{{" that triggered this call
        braces = 2
        while self._read() == "{":  # Count any further consecutive braces
            self._head += 1
            braces += 1
        self._push()  # Working stack that collects the parsed tokens

        while braces:
            if braces == 1:
                # One brace left over: it is text in front of what was parsed
                return self._write_text_then_stack("{")
            if braces == 2:
                try:
                    self._parse_template()
                except BadRoute:
                    return self._write_text_then_stack("{{")
                break
            try:
                # Three or more braces: try an argument first...
                self._parse_argument()
                braces -= 3
            except BadRoute:
                try:
                    # ...then fall back to a template
                    self._parse_template()
                    braces -= 2
                except BadRoute:
                    return self._write_text_then_stack("{" * braces)
            if braces:
                self._head += 1

        self._write_all(self._pop())
  174. def _parse_template(self):
  175. """Parse a template at the head of the wikicode string."""
  176. reset = self._head
  177. try:
  178. template = self._parse(contexts.TEMPLATE_NAME)
  179. except BadRoute:
  180. self._head = reset
  181. raise
  182. self._write_first(tokens.TemplateOpen())
  183. self._write_all(template)
  184. self._write(tokens.TemplateClose())
  185. def _parse_argument(self):
  186. """Parse an argument at the head of the wikicode string."""
  187. reset = self._head
  188. try:
  189. argument = self._parse(contexts.ARGUMENT_NAME)
  190. except BadRoute:
  191. self._head = reset
  192. raise
  193. self._write_first(tokens.ArgumentOpen())
  194. self._write_all(argument)
  195. self._write(tokens.ArgumentClose())
  196. def _handle_template_param(self):
  197. """Handle a template parameter at the head of the string."""
  198. if self._context & contexts.TEMPLATE_NAME:
  199. self._context ^= contexts.TEMPLATE_NAME
  200. elif self._context & contexts.TEMPLATE_PARAM_VALUE:
  201. self._context ^= contexts.TEMPLATE_PARAM_VALUE
  202. elif self._context & contexts.TEMPLATE_PARAM_KEY:
  203. self._write_all(self._pop(keep_context=True))
  204. self._context |= contexts.TEMPLATE_PARAM_KEY
  205. self._write(tokens.TemplateParamSeparator())
  206. self._push(self._context)
  207. def _handle_template_param_value(self):
  208. """Handle a template parameter's value at the head of the string."""
  209. self._write_all(self._pop(keep_context=True))
  210. self._context ^= contexts.TEMPLATE_PARAM_KEY
  211. self._context |= contexts.TEMPLATE_PARAM_VALUE
  212. self._write(tokens.TemplateParamEquals())
  213. def _handle_template_end(self):
  214. """Handle the end of a template at the head of the string."""
  215. if self._context & contexts.TEMPLATE_PARAM_KEY:
  216. self._write_all(self._pop(keep_context=True))
  217. self._head += 1
  218. return self._pop()
  219. def _handle_argument_separator(self):
  220. """Handle the separator between an argument's name and default."""
  221. self._context ^= contexts.ARGUMENT_NAME
  222. self._context |= contexts.ARGUMENT_DEFAULT
  223. self._write(tokens.ArgumentSeparator())
  224. def _handle_argument_end(self):
  225. """Handle the end of an argument at the head of the string."""
  226. self._head += 2
  227. return self._pop()
  228. def _parse_wikilink(self):
  229. """Parse an internal wikilink at the head of the wikicode string."""
  230. self._head += 2
  231. reset = self._head - 1
  232. try:
  233. wikilink = self._parse(contexts.WIKILINK_TITLE)
  234. except BadRoute:
  235. self._head = reset
  236. self._write_text("[[")
  237. else:
  238. self._write(tokens.WikilinkOpen())
  239. self._write_all(wikilink)
  240. self._write(tokens.WikilinkClose())
  241. def _handle_wikilink_separator(self):
  242. """Handle the separator between a wikilink's title and its text."""
  243. self._context ^= contexts.WIKILINK_TITLE
  244. self._context |= contexts.WIKILINK_TEXT
  245. self._write(tokens.WikilinkSeparator())
  246. def _handle_wikilink_end(self):
  247. """Handle the end of a wikilink at the head of the string."""
  248. self._head += 1
  249. return self._pop()
  250. def _parse_heading(self):
  251. """Parse a section heading at the head of the wikicode string."""
  252. self._global |= contexts.GL_HEADING
  253. reset = self._head
  254. self._head += 1
  255. best = 1
  256. while self._read() == "=":
  257. best += 1
  258. self._head += 1
  259. context = contexts.HEADING_LEVEL_1 << min(best - 1, 5)
  260. try:
  261. title, level = self._parse(context)
  262. except BadRoute:
  263. self._head = reset + best - 1
  264. self._write_text("=" * best)
  265. else:
  266. self._write(tokens.HeadingStart(level=level))
  267. if level < best:
  268. self._write_text("=" * (best - level))
  269. self._write_all(title)
  270. self._write(tokens.HeadingEnd())
  271. finally:
  272. self._global ^= contexts.GL_HEADING
    def _handle_heading_end(self):
        """Handle the end of a section heading at the head of the string.

        Returns a ``(stack, level)`` tuple: the popped heading stack and the
        heading level that was settled on.
        """
        reset = self._head
        self._head += 1
        best = 1
        while self._read() == "=":  # Count the run of closing equals signs
            best += 1
            self._head += 1
        # Recover the opening level from the context, which _parse_heading()
        # builds as HEADING_LEVEL_1 shifted left by (level - 1).
        current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
        level = min(current, min(best, 6))

        try:  # Try to check for a heading closure after this one
            after, after_level = self._parse(self._context)
        except BadRoute:
            # No later closure: this run of equals signs ends the heading;
            # signs beyond the level are kept as text inside the title.
            if level < best:
                self._write_text("=" * (best - level))
            self._head = reset + best - 1
            return self._pop(), level
        else:  # Found another closure
            # This run was not the real end, so it is plain text followed by
            # everything parsed up to the later closure.
            self._write_text("=" * best)
            self._write_all(after)
            return self._pop(), after_level
  294. def _really_parse_entity(self):
  295. """Actually parse an HTML entity and ensure that it is valid."""
  296. self._write(tokens.HTMLEntityStart())
  297. self._head += 1
  298. this = self._read(strict=True)
  299. if this == "#":
  300. numeric = True
  301. self._write(tokens.HTMLEntityNumeric())
  302. self._head += 1
  303. this = self._read(strict=True)
  304. if this[0].lower() == "x":
  305. hexadecimal = True
  306. self._write(tokens.HTMLEntityHex(char=this[0]))
  307. this = this[1:]
  308. if not this:
  309. self._fail_route()
  310. else:
  311. hexadecimal = False
  312. else:
  313. numeric = hexadecimal = False
  314. valid = "0123456789abcdefABCDEF" if hexadecimal else "0123456789"
  315. if not numeric and not hexadecimal:
  316. valid += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
  317. if not all([char in valid for char in this]):
  318. self._fail_route()
  319. self._head += 1
  320. if self._read() != ";":
  321. self._fail_route()
  322. if numeric:
  323. test = int(this, 16) if hexadecimal else int(this)
  324. if test < 1 or test > 0x10FFFF:
  325. self._fail_route()
  326. else:
  327. if this not in htmlentities.entitydefs:
  328. self._fail_route()
  329. self._write(tokens.Text(text=this))
  330. self._write(tokens.HTMLEntityEnd())
  331. def _parse_entity(self):
  332. """Parse an HTML entity at the head of the wikicode string."""
  333. reset = self._head
  334. self._push()
  335. try:
  336. self._really_parse_entity()
  337. except BadRoute:
  338. self._head = reset
  339. self._write_text(self._read())
  340. else:
  341. self._write_all(self._pop())
  342. def _parse_comment(self):
  343. """Parse an HTML comment at the head of the wikicode string."""
  344. self._head += 4
  345. reset = self._head - 1
  346. try:
  347. comment = self._parse(contexts.COMMENT)
  348. except BadRoute:
  349. self._head = reset
  350. self._write_text("<!--")
  351. else:
  352. self._write(tokens.CommentStart())
  353. self._write_all(comment)
  354. self._write(tokens.CommentEnd())
  355. self._head += 2
  356. def _parse_tag(self):
  357. """Parse an HTML tag at the head of the wikicode string."""
  358. reset = self._head
  359. self._head += 1
  360. try:
  361. tokens = self._parse(contexts.TAG_OPEN_NAME)
  362. except BadRoute:
  363. self._head = reset
  364. self._write_text("<")
  365. else:
  366. self._write_all(tokens)
    def _get_tag_type_from_stack(self, stack=None):
        """Return the tag type based on the text in *stack*.

        If *stack* is ``None``, we will use the current, topmost one. The
        route is failed if the stack is empty. A name that is not a key of
        ``Tag.TRANSLATIONS`` yields ``Tag.TAG_UNKNOWN``.
        """
        if stack is None:
            stack = self._stack
            # Flush pending text so the tag name is present as a Text token
            self._push_textbuffer()
        if not stack:
            self._fail_route()  # Tag has an empty name?
        # Only Text tokens contribute to the name; it is compared lowercased
        # and without trailing whitespace.
        text = [tok for tok in stack if isinstance(tok, tokens.Text)]
        text = "".join([token.text for token in text]).rstrip().lower()
        try:
            return Tag.TRANSLATIONS[text]
        except KeyError:
            return Tag.TAG_UNKNOWN
    def _actually_close_tag_opening(self):
        """Handle cleanup at the end of an opening tag.

        The current context will be updated and the
        :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the
        opening tag's padding to be used in the
        :py:class:`~.tokens.TagOpenClose` token.
        """
        if self._context & contexts.TAG_OPEN_ATTR:
            # We were inside the attributes: clear whichever part was active
            if self._context & contexts.TAG_OPEN_ATTR_NAME:
                self._context ^= contexts.TAG_OPEN_ATTR_NAME
            if self._context & contexts.TAG_OPEN_ATTR_BODY:
                self._context ^= contexts.TAG_OPEN_ATTR_BODY
        else:
            # Still reading the tag name, so TagOpenOpen is not written yet
            tag = self._get_tag_type_from_stack()
            self._write_first(tokens.TagOpenOpen(type=tag, showtag=True))
            self._context ^= contexts.TAG_OPEN_NAME
        self._context |= contexts.TAG_BODY

        # A trailing TagAttrStart has no attribute after it; remove it and
        # hand its padding back to the caller instead.
        if isinstance(self._stack[-1], tokens.TagAttrStart):
            return self._stack.pop().padding
        return ""
    def _actually_handle_chunk(self, chunks, is_new):
        """Actually handle a chunk of code within a tag's attributes.

        Called by :py:meth:`_handle_tag_chunk` and
        :py:meth:`_handle_tag_attribute_body`.

        *chunks* is a list of space-separated pieces; this call removes the
        pieces it handles from the front of the list. *is_new* says whether
        the next piece starts a new attribute. Returns ``True`` when the piece
        ended a quoted attribute value, and ``None`` otherwise.
        """
        if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
            # Leading empty strings come from consecutive spaces; they are
            # counted and become the padding of the new attribute.
            padding = 0
            while chunks:
                if chunks[0] == "":
                    padding += 1
                    chunks.pop(0)
                else:
                    break
            self._write(tokens.TagAttrStart(padding=" " * padding))
        elif self._context & contexts.TAG_OPEN_ATTR_IGNORE:
            # Skip exactly one piece, then stop ignoring
            self._context ^= contexts.TAG_OPEN_ATTR_IGNORE
            chunks.pop(0)
            return
        elif self._context & contexts.TAG_OPEN_ATTR_QUOTED:
            self._write_text(" ")  # Quoted chunks don't lose their spaces

        if chunks:
            chunk = chunks.pop(0)
            if self._context & contexts.TAG_OPEN_ATTR_BODY:
                self._context ^= contexts.TAG_OPEN_ATTR_BODY
                self._context |= contexts.TAG_OPEN_ATTR_NAME
            if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
                # An unescaped quote anywhere but the end is invalid
                if re.search(r'[^\\]"', chunk[:-1]):
                    self._fail_route()
                # An unescaped quote at the end closes the quoted value
                if re.search(r'[^\\]"$', chunk):
                    self._write_text(chunk[:-1])
                    self._context ^= contexts.TAG_OPEN_ATTR_QUOTED
                    self._context |= contexts.TAG_OPEN_ATTR_NAME
                    return True  # Back to _handle_tag_attribute_body()
            self._write_text(chunk)
    def _handle_tag_chunk(self, text):
        """Handle a chunk of code within a tag's attributes.

        This is called by :py:meth:`_parse`, which intercepts parsing of
        wikicode when we're inside of an opening tag and no :py:attr:`MARKERS`
        are present.

        Returns the popped stack when a quoted attribute value was completed
        while handling *text*, and ``None`` otherwise.
        """
        if " " not in text:
            # No space, so no attribute boundary: it is all plain text
            self._write_text(text)
            return
        chunks = text.split(" ")
        is_new = False
        is_quoted = False
        if self._context & contexts.TAG_OPEN_NAME:
            # The first piece finishes the tag name; the rest are attributes
            self._write_text(chunks.pop(0))
            tag = self._get_tag_type_from_stack()
            self._write_first(tokens.TagOpenOpen(type=tag, showtag=True))
            self._context ^= contexts.TAG_OPEN_NAME
            self._context |= contexts.TAG_OPEN_ATTR_NAME
            self._actually_handle_chunk(chunks, True)
            is_new = True
        while chunks:
            result = self._actually_handle_chunk(chunks, is_new)
            is_quoted = result or is_quoted
            is_new = True  # Every piece after the first starts a new attribute
        if is_quoted:
            return self._pop()
  452. self._context ^= contexts.TAG_OPEN_NAME
  453. self._context |= contexts.TAG_OPEN_ATTR_NAME
  454. self._actually_handle_chunk(chunks, True)
  455. is_new = True
  456. while chunks:
  457. result = self._actually_handle_chunk(chunks, is_new)
  458. is_quoted = result or is_quoted
  459. is_new = True
  460. if is_quoted:
  461. return self._pop()
  462. def _handle_tag_attribute_body(self):
  463. """Handle the body, or value, of a tag attribute.
  464. Attribute bodies can usually be handled at once, but sometimes a new
  465. stack must be created to keep track of "rich" attribute values that
  466. contain, for example, templates.
  467. """
  468. self._context ^= contexts.TAG_OPEN_ATTR_NAME
  469. self._context |= contexts.TAG_OPEN_ATTR_BODY
  470. self._write(tokens.TagAttrEquals())
  471. next = self._read(1)
  472. if next not in self.MARKERS and next.startswith('"'):
  473. chunks = None
  474. if " " in next:
  475. chunks = next.split(" ")
  476. next = chunks.pop(0)
  477. if re.search(r'[^\\]"$', next[1:]):
  478. if not re.search(r'[^\\]"', next[1:-1]):
  479. self._write(tokens.TagAttrQuote())
  480. self._write_text(next[1:-1])
  481. self._head += 1
  482. else:
  483. if not re.search(r'[^\\]"', next[1:]):
  484. self._head += 1
  485. reset = self._head
  486. try:
  487. attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED |
  488. contexts.TAG_OPEN_ATTR_IGNORE)
  489. except BadRoute:
  490. self._head = reset
  491. self._write_text(next)
  492. else:
  493. self._write(tokens.TagAttrQuote())
  494. self._write_text(next[1:])
  495. self._write_all(attr)
  496. return
  497. self._context ^= contexts.TAG_OPEN_ATTR_BODY
  498. self._context |= contexts.TAG_OPEN_ATTR_NAME
  499. while chunks:
  500. self._actually_handle_chunk(chunks, True)
  501. def _handle_tag_close_open(self):
  502. """Handle the ending of an open tag (``<foo>``)."""
  503. padding = self._actually_close_tag_opening()
  504. self._write(tokens.TagCloseOpen(padding=padding))
  505. def _handle_tag_selfclose(self):
  506. """Handle the ending of an tag that closes itself (``<foo />``)."""
  507. padding = self._actually_close_tag_opening()
  508. self._write(tokens.TagCloseSelfclose(padding=padding))
  509. self._head += 1
  510. return self._pop()
  511. def _handle_tag_open_close(self):
  512. """Handle the opening of a closing tag (``</foo>``)."""
  513. self._write(tokens.TagOpenClose())
  514. self._push(contexts.TAG_CLOSE)
  515. self._head += 1
  516. def _handle_tag_close_close(self):
  517. """Handle the ending of a closing tag (``</foo>``)."""
  518. closing = self._pop()
  519. tag = self._get_tag_type_from_stack(closing)
  520. if tag != self._stack[0].type:
  521. # Closing and opening tags are not the same, so fail this route:
  522. self._fail_route()
  523. self._write_all(closing)
  524. self._write(tokens.TagCloseClose())
  525. return self._pop()
  526. def _verify_safe(self, this):
  527. """Make sure we are not trying to write an invalid character."""
  528. context = self._context
  529. if context & contexts.FAIL_NEXT:
  530. return False
  531. if context & contexts.WIKILINK_TITLE:
  532. if this == "]" or this == "{":
  533. self._context |= contexts.FAIL_NEXT
  534. elif this == "\n" or this == "[" or this == "}":
  535. return False
  536. return True
  537. if context & contexts.TEMPLATE_NAME:
  538. if this == "{" or this == "}" or this == "[":
  539. self._context |= contexts.FAIL_NEXT
  540. return True
  541. if this == "]":
  542. return False
  543. if this == "|":
  544. return True
  545. if context & contexts.HAS_TEXT:
  546. if context & contexts.FAIL_ON_TEXT:
  547. if this is self.END or not this.isspace():
  548. return False
  549. else:
  550. if this == "\n":
  551. self._context |= contexts.FAIL_ON_TEXT
  552. elif this is self.END or not this.isspace():
  553. self._context |= contexts.HAS_TEXT
  554. return True
  555. else:
  556. if context & contexts.FAIL_ON_EQUALS:
  557. if this == "=":
  558. return False
  559. elif context & contexts.FAIL_ON_LBRACE:
  560. if this == "{" or (self._read(-1) == self._read(-2) == "{"):
  561. if context & contexts.TEMPLATE:
  562. self._context |= contexts.FAIL_ON_EQUALS
  563. else:
  564. self._context |= contexts.FAIL_NEXT
  565. return True
  566. self._context ^= contexts.FAIL_ON_LBRACE
  567. elif context & contexts.FAIL_ON_RBRACE:
  568. if this == "}":
  569. if context & contexts.TEMPLATE:
  570. self._context |= contexts.FAIL_ON_EQUALS
  571. else:
  572. self._context |= contexts.FAIL_NEXT
  573. return True
  574. self._context ^= contexts.FAIL_ON_RBRACE
  575. elif this == "{":
  576. self._context |= contexts.FAIL_ON_LBRACE
  577. elif this == "}":
  578. self._context |= contexts.FAIL_ON_RBRACE
  579. return True
  580. def _parse(self, context=0):
  581. """Parse the wikicode string, using *context* for when to stop."""
  582. self._push(context)
  583. while True:
  584. this = self._read()
  585. unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
  586. contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME)
  587. if self._context & unsafe:
  588. if not self._verify_safe(this):
  589. if self._context & contexts.TEMPLATE_PARAM_KEY:
  590. self._pop()
  591. self._fail_route()
  592. if this not in self.MARKERS:
  593. if self._context & contexts.TAG_OPEN:
  594. should_exit = self._handle_tag_chunk(this)
  595. if should_exit:
  596. return should_exit
  597. else:
  598. self._write_text(this)
  599. self._head += 1
  600. continue
  601. if this is self.END:
  602. fail = (
  603. contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
  604. contexts.HEADING | contexts.COMMENT | contexts.TAG)
  605. double_fail = (
  606. contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE |
  607. contexts.TAG_OPEN_ATTR_QUOTED)
  608. if self._context & double_fail:
  609. self._pop()
  610. if self._context & fail:
  611. self._fail_route()
  612. return self._pop()
  613. next = self._read(1)
  614. if self._context & contexts.COMMENT:
  615. if this == next == "-" and self._read(2) == ">":
  616. return self._pop()
  617. else:
  618. self._write_text(this)
  619. elif this == next == "{":
  620. if self._can_recurse():
  621. self._parse_template_or_argument()
  622. if self._context & contexts.FAIL_NEXT:
  623. self._context ^= contexts.FAIL_NEXT
  624. else:
  625. self._write_text("{")
  626. elif this == "|" and self._context & contexts.TEMPLATE:
  627. self._handle_template_param()
  628. elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
  629. self._handle_template_param_value()
  630. elif this == next == "}" and self._context & contexts.TEMPLATE:
  631. return self._handle_template_end()
  632. elif this == "|" and self._context & contexts.ARGUMENT_NAME:
  633. self._handle_argument_separator()
  634. elif this == next == "}" and self._context & contexts.ARGUMENT:
  635. if self._read(2) == "}":
  636. return self._handle_argument_end()
  637. else:
  638. self._write_text("}")
  639. elif this == next == "[":
  640. if not self._context & contexts.WIKILINK_TITLE and self._can_recurse():
  641. self._parse_wikilink()
  642. if self._context & contexts.FAIL_NEXT:
  643. self._context ^= contexts.FAIL_NEXT
  644. else:
  645. self._write_text("[")
  646. elif this == "|" and self._context & contexts.WIKILINK_TITLE:
  647. self._handle_wikilink_separator()
  648. elif this == next == "]" and self._context & contexts.WIKILINK:
  649. return self._handle_wikilink_end()
  650. elif this == "=" and not self._global & contexts.GL_HEADING:
  651. if self._read(-1) in ("\n", self.START):
  652. self._parse_heading()
  653. elif self._context & contexts.TAG_OPEN_ATTR_NAME:
  654. self._handle_tag_attribute_body()
  655. else:
  656. self._write_text("=")
  657. elif this == "=" and self._context & contexts.HEADING:
  658. return self._handle_heading_end()
  659. elif this == "\n" and self._context & contexts.HEADING:
  660. self._fail_route()
  661. elif this == "&":
  662. self._parse_entity()
  663. elif this == "<" and next == "!":
  664. if self._read(2) == self._read(3) == "-":
  665. self._parse_comment()
  666. else:
  667. self._write_text(this)
  668. elif this == "<" and next != "/" and (
  669. not self._context & (contexts.TAG ^ contexts.TAG_BODY)):
  670. self._parse_tag()
  671. elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED):
  672. if this == "\n":
  673. if self._context & contexts.TAG_CLOSE:
  674. self._pop()
  675. self._fail_route()
  676. elif this == ">":
  677. self._handle_tag_close_open()
  678. elif this == "/" and next == ">":
  679. return self._handle_tag_selfclose()
  680. elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME:
  681. self._handle_tag_attribute_body()
  682. elif this == "<" and next == "/" and self._context & contexts.TAG_BODY:
  683. self._handle_tag_open_close()
  684. elif this == ">" and self._context & contexts.TAG_CLOSE:
  685. return self._handle_tag_close_close()
  686. else:
  687. self._write_text(this)
  688. self._head += 1
  689. def tokenize(self, text):
  690. """Build a list of tokens from a string of wikicode and return it."""
  691. split = self.regex.split(text)
  692. self._text = [segment for segment in split if segment]
  693. return self._parse()