A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

764 rivejä
29 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from math import log
  24. import re
  25. from . import contexts
  26. from . import tokens
  27. from ..compat import htmlentities
  28. from ..tag_defs import is_parsable
  29. __all__ = ["Tokenizer"]
  30. class BadRoute(Exception):
  31. """Raised internally when the current tokenization route is invalid."""
  32. pass
  33. class Tokenizer(object):
  34. """Creates a list of tokens from a string of wikicode."""
  35. USES_C = False
  36. START = object()
  37. END = object()
  38. MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
  39. "/", "-", "!", "\n", END]
  40. MAX_DEPTH = 40
  41. MAX_CYCLES = 100000
  42. regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)
  43. def __init__(self):
  44. self._text = None
  45. self._head = 0
  46. self._stacks = []
  47. self._global = 0
  48. self._depth = 0
  49. self._cycles = 0
  50. @property
  51. def _stack(self):
  52. """The current token stack."""
  53. return self._stacks[-1][0]
  54. @property
  55. def _context(self):
  56. """The current token context."""
  57. return self._stacks[-1][1]
  58. @_context.setter
  59. def _context(self, value):
  60. self._stacks[-1][1] = value
  61. @property
  62. def _textbuffer(self):
  63. """The current textbuffer."""
  64. return self._stacks[-1][2]
  65. @_textbuffer.setter
  66. def _textbuffer(self, value):
  67. self._stacks[-1][2] = value
  68. def _push(self, context=0):
  69. """Add a new token stack, context, and textbuffer to the list."""
  70. self._stacks.append([[], context, []])
  71. self._depth += 1
  72. self._cycles += 1
  73. def _push_textbuffer(self):
  74. """Push the textbuffer onto the stack as a Text node and clear it."""
  75. if self._textbuffer:
  76. self._stack.append(tokens.Text(text="".join(self._textbuffer)))
  77. self._textbuffer = []
  78. def _pop(self, keep_context=False):
  79. """Pop the current stack/context/textbuffer, returing the stack.
  80. If *keep_context* is ``True``, then we will replace the underlying
  81. stack's context with the current stack's.
  82. """
  83. self._push_textbuffer()
  84. self._depth -= 1
  85. if keep_context:
  86. context = self._context
  87. stack = self._stacks.pop()[0]
  88. self._context = context
  89. return stack
  90. return self._stacks.pop()[0]
  91. def _can_recurse(self):
  92. """Return whether or not our max recursion depth has been exceeded."""
  93. return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES
  94. def _fail_route(self):
  95. """Fail the current tokenization route.
  96. Discards the current stack/context/textbuffer and raises
  97. :py:exc:`~.BadRoute`.
  98. """
  99. self._pop()
  100. raise BadRoute()
  101. def _write(self, token):
  102. """Write a token to the end of the current token stack."""
  103. self._push_textbuffer()
  104. self._stack.append(token)
  105. def _write_first(self, token):
  106. """Write a token to the beginning of the current token stack."""
  107. self._push_textbuffer()
  108. self._stack.insert(0, token)
  109. def _write_text(self, text):
  110. """Write text to the current textbuffer."""
  111. self._textbuffer.append(text)
  112. def _write_all(self, tokenlist):
  113. """Write a series of tokens to the current stack at once."""
  114. if tokenlist and isinstance(tokenlist[0], tokens.Text):
  115. self._write_text(tokenlist.pop(0).text)
  116. self._push_textbuffer()
  117. self._stack.extend(tokenlist)
  118. def _write_text_then_stack(self, text):
  119. """Pop the current stack, write *text*, and then write the stack."""
  120. stack = self._pop()
  121. self._write_text(text)
  122. if stack:
  123. self._write_all(stack)
  124. self._head -= 1
  125. def _read(self, delta=0, wrap=False, strict=False):
  126. """Read the value at a relative point in the wikicode.
  127. The value is read from :py:attr:`self._head <_head>` plus the value of
  128. *delta* (which can be negative). If *wrap* is ``False``, we will not
  129. allow attempts to read from the end of the string if ``self._head +
  130. delta`` is negative. If *strict* is ``True``, the route will be failed
  131. (with :py:meth:`_fail_route`) if we try to read from past the end of
  132. the string; otherwise, :py:attr:`self.END <END>` is returned. If we try
  133. to read from before the start of the string, :py:attr:`self.START
  134. <START>` is returned.
  135. """
  136. index = self._head + delta
  137. if index < 0 and (not wrap or abs(index) > len(self._text)):
  138. return self.START
  139. try:
  140. return self._text[index]
  141. except IndexError:
  142. if strict:
  143. self._fail_route()
  144. return self.END
  145. def _parse_template_or_argument(self):
  146. """Parse a template or argument at the head of the wikicode string."""
  147. self._head += 2
  148. braces = 2
  149. while self._read() == "{":
  150. self._head += 1
  151. braces += 1
  152. self._push()
  153. while braces:
  154. if braces == 1:
  155. return self._write_text_then_stack("{")
  156. if braces == 2:
  157. try:
  158. self._parse_template()
  159. except BadRoute:
  160. return self._write_text_then_stack("{{")
  161. break
  162. try:
  163. self._parse_argument()
  164. braces -= 3
  165. except BadRoute:
  166. try:
  167. self._parse_template()
  168. braces -= 2
  169. except BadRoute:
  170. return self._write_text_then_stack("{" * braces)
  171. if braces:
  172. self._head += 1
  173. self._write_all(self._pop())
  174. def _parse_template(self):
  175. """Parse a template at the head of the wikicode string."""
  176. reset = self._head
  177. try:
  178. template = self._parse(contexts.TEMPLATE_NAME)
  179. except BadRoute:
  180. self._head = reset
  181. raise
  182. self._write_first(tokens.TemplateOpen())
  183. self._write_all(template)
  184. self._write(tokens.TemplateClose())
  185. def _parse_argument(self):
  186. """Parse an argument at the head of the wikicode string."""
  187. reset = self._head
  188. try:
  189. argument = self._parse(contexts.ARGUMENT_NAME)
  190. except BadRoute:
  191. self._head = reset
  192. raise
  193. self._write_first(tokens.ArgumentOpen())
  194. self._write_all(argument)
  195. self._write(tokens.ArgumentClose())
  196. def _handle_template_param(self):
  197. """Handle a template parameter at the head of the string."""
  198. if self._context & contexts.TEMPLATE_NAME:
  199. self._context ^= contexts.TEMPLATE_NAME
  200. elif self._context & contexts.TEMPLATE_PARAM_VALUE:
  201. self._context ^= contexts.TEMPLATE_PARAM_VALUE
  202. elif self._context & contexts.TEMPLATE_PARAM_KEY:
  203. self._write_all(self._pop(keep_context=True))
  204. self._context |= contexts.TEMPLATE_PARAM_KEY
  205. self._write(tokens.TemplateParamSeparator())
  206. self._push(self._context)
  207. def _handle_template_param_value(self):
  208. """Handle a template parameter's value at the head of the string."""
  209. self._write_all(self._pop(keep_context=True))
  210. self._context ^= contexts.TEMPLATE_PARAM_KEY
  211. self._context |= contexts.TEMPLATE_PARAM_VALUE
  212. self._write(tokens.TemplateParamEquals())
  213. def _handle_template_end(self):
  214. """Handle the end of a template at the head of the string."""
  215. if self._context & contexts.TEMPLATE_PARAM_KEY:
  216. self._write_all(self._pop(keep_context=True))
  217. self._head += 1
  218. return self._pop()
  219. def _handle_argument_separator(self):
  220. """Handle the separator between an argument's name and default."""
  221. self._context ^= contexts.ARGUMENT_NAME
  222. self._context |= contexts.ARGUMENT_DEFAULT
  223. self._write(tokens.ArgumentSeparator())
  224. def _handle_argument_end(self):
  225. """Handle the end of an argument at the head of the string."""
  226. self._head += 2
  227. return self._pop()
  228. def _parse_wikilink(self):
  229. """Parse an internal wikilink at the head of the wikicode string."""
  230. self._head += 2
  231. reset = self._head - 1
  232. try:
  233. wikilink = self._parse(contexts.WIKILINK_TITLE)
  234. except BadRoute:
  235. self._head = reset
  236. self._write_text("[[")
  237. else:
  238. self._write(tokens.WikilinkOpen())
  239. self._write_all(wikilink)
  240. self._write(tokens.WikilinkClose())
  241. def _handle_wikilink_separator(self):
  242. """Handle the separator between a wikilink's title and its text."""
  243. self._context ^= contexts.WIKILINK_TITLE
  244. self._context |= contexts.WIKILINK_TEXT
  245. self._write(tokens.WikilinkSeparator())
  246. def _handle_wikilink_end(self):
  247. """Handle the end of a wikilink at the head of the string."""
  248. self._head += 1
  249. return self._pop()
  250. def _parse_heading(self):
  251. """Parse a section heading at the head of the wikicode string."""
  252. self._global |= contexts.GL_HEADING
  253. reset = self._head
  254. self._head += 1
  255. best = 1
  256. while self._read() == "=":
  257. best += 1
  258. self._head += 1
  259. context = contexts.HEADING_LEVEL_1 << min(best - 1, 5)
  260. try:
  261. title, level = self._parse(context)
  262. except BadRoute:
  263. self._head = reset + best - 1
  264. self._write_text("=" * best)
  265. else:
  266. self._write(tokens.HeadingStart(level=level))
  267. if level < best:
  268. self._write_text("=" * (best - level))
  269. self._write_all(title)
  270. self._write(tokens.HeadingEnd())
  271. finally:
  272. self._global ^= contexts.GL_HEADING
  273. def _handle_heading_end(self):
  274. """Handle the end of a section heading at the head of the string."""
  275. reset = self._head
  276. self._head += 1
  277. best = 1
  278. while self._read() == "=":
  279. best += 1
  280. self._head += 1
  281. current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
  282. level = min(current, min(best, 6))
  283. try: # Try to check for a heading closure after this one
  284. after, after_level = self._parse(self._context)
  285. except BadRoute:
  286. if level < best:
  287. self._write_text("=" * (best - level))
  288. self._head = reset + best - 1
  289. return self._pop(), level
  290. else: # Found another closure
  291. self._write_text("=" * best)
  292. self._write_all(after)
  293. return self._pop(), after_level
  294. def _really_parse_entity(self):
  295. """Actually parse an HTML entity and ensure that it is valid."""
  296. self._write(tokens.HTMLEntityStart())
  297. self._head += 1
  298. this = self._read(strict=True)
  299. if this == "#":
  300. numeric = True
  301. self._write(tokens.HTMLEntityNumeric())
  302. self._head += 1
  303. this = self._read(strict=True)
  304. if this[0].lower() == "x":
  305. hexadecimal = True
  306. self._write(tokens.HTMLEntityHex(char=this[0]))
  307. this = this[1:]
  308. if not this:
  309. self._fail_route()
  310. else:
  311. hexadecimal = False
  312. else:
  313. numeric = hexadecimal = False
  314. valid = "0123456789abcdefABCDEF" if hexadecimal else "0123456789"
  315. if not numeric and not hexadecimal:
  316. valid += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
  317. if not all([char in valid for char in this]):
  318. self._fail_route()
  319. self._head += 1
  320. if self._read() != ";":
  321. self._fail_route()
  322. if numeric:
  323. test = int(this, 16) if hexadecimal else int(this)
  324. if test < 1 or test > 0x10FFFF:
  325. self._fail_route()
  326. else:
  327. if this not in htmlentities.entitydefs:
  328. self._fail_route()
  329. self._write(tokens.Text(text=this))
  330. self._write(tokens.HTMLEntityEnd())
  331. def _parse_entity(self):
  332. """Parse an HTML entity at the head of the wikicode string."""
  333. reset = self._head
  334. self._push()
  335. try:
  336. self._really_parse_entity()
  337. except BadRoute:
  338. self._head = reset
  339. self._write_text(self._read())
  340. else:
  341. self._write_all(self._pop())
  342. def _parse_comment(self):
  343. """Parse an HTML comment at the head of the wikicode string."""
  344. self._head += 4
  345. reset = self._head - 1
  346. try:
  347. comment = self._parse(contexts.COMMENT)
  348. except BadRoute:
  349. self._head = reset
  350. self._write_text("<!--")
  351. else:
  352. self._write(tokens.CommentStart())
  353. self._write_all(comment)
  354. self._write(tokens.CommentEnd())
  355. self._head += 2
  356. def _parse_tag(self):
  357. """Parse an HTML tag at the head of the wikicode string."""
  358. reset = self._head
  359. self._head += 1
  360. try:
  361. tokens = self._parse(contexts.TAG_OPEN_NAME)
  362. except BadRoute:
  363. self._head = reset
  364. self._write_text("<")
  365. else:
  366. self._write_all(tokens)
  367. def _get_tag_from_stack(self, stack=None):
  368. """Return the tag based on the text in *stack*.
  369. If *stack* is ``None``, we will use the current, topmost one.
  370. """
  371. if stack is None:
  372. stack = self._stack
  373. self._push_textbuffer()
  374. if not stack:
  375. self._fail_route() # Tag has an empty name?
  376. text = [tok for tok in stack if isinstance(tok, tokens.Text)]
  377. return "".join([token.text for token in text]).rstrip().lower()
  378. def _actually_close_tag_opening(self):
  379. """Handle cleanup at the end of a opening tag.
  380. The current context will be updated and the
  381. :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the
  382. opening tag's padding to be used in the
  383. :py:class:`~.tokens.TagOpenClose` token.
  384. """
  385. if self._context & contexts.TAG_OPEN_ATTR:
  386. if self._context & contexts.TAG_OPEN_ATTR_NAME:
  387. self._context ^= contexts.TAG_OPEN_ATTR_NAME
  388. if self._context & contexts.TAG_OPEN_ATTR_BODY:
  389. self._context ^= contexts.TAG_OPEN_ATTR_BODY
  390. else:
  391. self._write_first(tokens.TagOpenOpen(showtag=True))
  392. self._context ^= contexts.TAG_OPEN_NAME
  393. self._context |= contexts.TAG_BODY
  394. self._push_textbuffer()
  395. if isinstance(self._stack[-1], tokens.TagAttrStart):
  396. return self._stack.pop().padding
  397. return ""
  398. def _actually_handle_chunk(self, chunks, is_new):
  399. """Actually handle a chunk of code within a tag's attributes.
  400. Called by :py:meth:`_handle_tag_chunk` and
  401. :py:meth:`_handle_tag_attribute_body`.
  402. """
  403. if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  404. padding = 0
  405. while chunks:
  406. if chunks[0] == "":
  407. padding += 1
  408. chunks.pop(0)
  409. else:
  410. break
  411. self._write(tokens.TagAttrStart(padding=" " * padding))
  412. elif self._context & contexts.TAG_OPEN_ATTR_IGNORE:
  413. self._context ^= contexts.TAG_OPEN_ATTR_IGNORE
  414. chunks.pop(0)
  415. return
  416. elif is_new and self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  417. self._write_text(" ") # Quoted chunks don't lose their spaces
  418. if chunks:
  419. chunk = chunks.pop(0)
  420. if self._context & contexts.TAG_OPEN_ATTR_BODY:
  421. self._context ^= contexts.TAG_OPEN_ATTR_BODY
  422. self._context |= contexts.TAG_OPEN_ATTR_NAME
  423. if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  424. if re.search(r'[^\\]"', chunk[:-1]):
  425. self._fail_route()
  426. if re.search(r'[^\\]"$', chunk):
  427. self._write_text(chunk[:-1])
  428. self._context ^= contexts.TAG_OPEN_ATTR_QUOTED
  429. self._context |= contexts.TAG_OPEN_ATTR_NAME
  430. return True # Back to _handle_tag_attribute_body()
  431. self._write_text(chunk)
  432. def _handle_tag_chunk(self, text):
  433. """Handle a chunk of code within a tag's attributes.
  434. This is called by :py:meth:`_parse`, which intercepts parsing of
  435. wikicode when we're inside of an opening tag and no :py:attr:`MARKERS`
  436. are present.
  437. """
  438. if " " not in text and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  439. self._write_text(text)
  440. return
  441. chunks = text.split(" ")
  442. is_new = False
  443. is_quoted = False
  444. if self._context & contexts.TAG_OPEN_NAME:
  445. self._write_text(chunks.pop(0))
  446. self._write_first(tokens.TagOpenOpen(showtag=True))
  447. self._context ^= contexts.TAG_OPEN_NAME
  448. self._context |= contexts.TAG_OPEN_ATTR_NAME
  449. self._actually_handle_chunk(chunks, True)
  450. is_new = True
  451. while chunks:
  452. result = self._actually_handle_chunk(chunks, is_new)
  453. is_quoted = result or is_quoted
  454. is_new = True
  455. if is_quoted:
  456. return self._pop()
  457. def _handle_tag_attribute_body(self):
  458. """Handle the body, or value, of a tag attribute.
  459. Attribute bodies can usually be handled at once, but sometimes a new
  460. stack must be created to keep track of "rich" attribute values that
  461. contain, for example, templates.
  462. """
  463. self._context ^= contexts.TAG_OPEN_ATTR_NAME
  464. self._context |= contexts.TAG_OPEN_ATTR_BODY
  465. self._write(tokens.TagAttrEquals())
  466. next = self._read(1)
  467. if next not in self.MARKERS and next.startswith('"'):
  468. chunks = None
  469. if " " in next:
  470. chunks = next.split(" ")
  471. next = chunks.pop(0)
  472. if re.search(r'[^\\]"$', next[1:]):
  473. if not re.search(r'[^\\]"', next[1:-1]):
  474. self._write(tokens.TagAttrQuote())
  475. self._write_text(next[1:-1])
  476. self._head += 1
  477. else:
  478. if not re.search(r'[^\\]"', next[1:]):
  479. self._head += 1
  480. reset = self._head
  481. try:
  482. attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED |
  483. contexts.TAG_OPEN_ATTR_IGNORE)
  484. except BadRoute:
  485. self._head = reset
  486. self._write_text(next)
  487. else:
  488. self._write(tokens.TagAttrQuote())
  489. self._write_text(next[1:])
  490. self._write_all(attr)
  491. return
  492. self._context ^= contexts.TAG_OPEN_ATTR_BODY
  493. self._context |= contexts.TAG_OPEN_ATTR_NAME
  494. while chunks:
  495. self._actually_handle_chunk(chunks, True)
  496. def _handle_tag_close_open(self):
  497. """Handle the ending of an open tag (``<foo>``)."""
  498. padding = self._actually_close_tag_opening()
  499. self._write(tokens.TagCloseOpen(padding=padding))
  500. def _handle_tag_selfclose(self):
  501. """Handle the ending of an tag that closes itself (``<foo />``)."""
  502. padding = self._actually_close_tag_opening()
  503. self._write(tokens.TagCloseSelfclose(padding=padding))
  504. self._head += 1
  505. return self._pop()
  506. def _handle_tag_open_close(self):
  507. """Handle the opening of a closing tag (``</foo>``)."""
  508. self._write(tokens.TagOpenClose())
  509. self._push(contexts.TAG_CLOSE)
  510. self._head += 1
  511. def _handle_tag_close_close(self):
  512. """Handle the ending of a closing tag (``</foo>``)."""
  513. closing = self._pop()
  514. if self._get_tag_from_stack(closing) != self._get_tag_from_stack():
  515. # Closing and opening tags are not the same, so fail this route:
  516. self._fail_route()
  517. self._write_all(closing)
  518. self._write(tokens.TagCloseClose())
  519. return self._pop()
  520. def _verify_safe(self, this):
  521. """Make sure we are not trying to write an invalid character."""
  522. context = self._context
  523. if context & contexts.FAIL_NEXT:
  524. return False
  525. if context & contexts.WIKILINK_TITLE:
  526. if this == "]" or this == "{":
  527. self._context |= contexts.FAIL_NEXT
  528. elif this == "\n" or this == "[" or this == "}":
  529. return False
  530. return True
  531. elif context & contexts.TEMPLATE_NAME:
  532. if this == "{" or this == "}" or this == "[":
  533. self._context |= contexts.FAIL_NEXT
  534. return True
  535. if this == "]":
  536. return False
  537. if this == "|":
  538. return True
  539. if context & contexts.HAS_TEXT:
  540. if context & contexts.FAIL_ON_TEXT:
  541. if this is self.END or not this.isspace():
  542. return False
  543. else:
  544. if this == "\n":
  545. self._context |= contexts.FAIL_ON_TEXT
  546. elif this is self.END or not this.isspace():
  547. self._context |= contexts.HAS_TEXT
  548. return True
  549. elif context & contexts.TAG_CLOSE:
  550. return this != "<" and this != "\n"
  551. else:
  552. if context & contexts.FAIL_ON_EQUALS:
  553. if this == "=":
  554. return False
  555. elif context & contexts.FAIL_ON_LBRACE:
  556. if this == "{" or (self._read(-1) == self._read(-2) == "{"):
  557. if context & contexts.TEMPLATE:
  558. self._context |= contexts.FAIL_ON_EQUALS
  559. else:
  560. self._context |= contexts.FAIL_NEXT
  561. return True
  562. self._context ^= contexts.FAIL_ON_LBRACE
  563. elif context & contexts.FAIL_ON_RBRACE:
  564. if this == "}":
  565. if context & contexts.TEMPLATE:
  566. self._context |= contexts.FAIL_ON_EQUALS
  567. else:
  568. self._context |= contexts.FAIL_NEXT
  569. return True
  570. self._context ^= contexts.FAIL_ON_RBRACE
  571. elif this == "{":
  572. self._context |= contexts.FAIL_ON_LBRACE
  573. elif this == "}":
  574. self._context |= contexts.FAIL_ON_RBRACE
  575. return True
  576. def _parse(self, context=0):
  577. """Parse the wikicode string, using *context* for when to stop."""
  578. self._push(context)
  579. while True:
  580. this = self._read()
  581. unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
  582. contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME |
  583. contexts.TAG_CLOSE)
  584. if self._context & unsafe:
  585. if not self._verify_safe(this):
  586. double = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
  587. if self._context & double:
  588. self._pop()
  589. self._fail_route()
  590. if this not in self.MARKERS:
  591. if self._context & contexts.TAG_OPEN:
  592. should_exit = self._handle_tag_chunk(this)
  593. if should_exit:
  594. return should_exit
  595. else:
  596. self._write_text(this)
  597. self._head += 1
  598. continue
  599. if this is self.END:
  600. fail = (
  601. contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
  602. contexts.HEADING | contexts.COMMENT | contexts.TAG)
  603. if self._context & fail:
  604. double_fail = (
  605. contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE |
  606. contexts.TAG_OPEN_ATTR_QUOTED)
  607. if self._context & double_fail:
  608. self._pop()
  609. self._fail_route()
  610. return self._pop()
  611. next = self._read(1)
  612. if self._context & contexts.COMMENT:
  613. if this == next == "-" and self._read(2) == ">":
  614. return self._pop()
  615. else:
  616. self._write_text(this)
  617. elif this == next == "{":
  618. if self._can_recurse():
  619. self._parse_template_or_argument()
  620. if self._context & contexts.FAIL_NEXT:
  621. self._context ^= contexts.FAIL_NEXT
  622. else:
  623. self._write_text("{")
  624. elif this == "|" and self._context & contexts.TEMPLATE:
  625. self._handle_template_param()
  626. elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
  627. self._handle_template_param_value()
  628. elif this == next == "}" and self._context & contexts.TEMPLATE:
  629. return self._handle_template_end()
  630. elif this == "|" and self._context & contexts.ARGUMENT_NAME:
  631. self._handle_argument_separator()
  632. elif this == next == "}" and self._context & contexts.ARGUMENT:
  633. if self._read(2) == "}":
  634. return self._handle_argument_end()
  635. else:
  636. self._write_text("}")
  637. elif this == next == "[":
  638. if not self._context & contexts.WIKILINK_TITLE and self._can_recurse():
  639. self._parse_wikilink()
  640. if self._context & contexts.FAIL_NEXT:
  641. self._context ^= contexts.FAIL_NEXT
  642. else:
  643. self._write_text("[")
  644. elif this == "|" and self._context & contexts.WIKILINK_TITLE:
  645. self._handle_wikilink_separator()
  646. elif this == next == "]" and self._context & contexts.WIKILINK:
  647. return self._handle_wikilink_end()
  648. elif this == "=" and not self._global & contexts.GL_HEADING:
  649. if self._read(-1) in ("\n", self.START):
  650. self._parse_heading()
  651. elif self._context & contexts.TAG_OPEN_ATTR_NAME:
  652. self._handle_tag_attribute_body()
  653. else:
  654. self._write_text("=")
  655. elif this == "=" and self._context & contexts.HEADING:
  656. return self._handle_heading_end()
  657. elif this == "\n" and self._context & contexts.HEADING:
  658. self._fail_route()
  659. elif this == "&":
  660. self._parse_entity()
  661. elif this == "<" and next == "!":
  662. if self._read(2) == self._read(3) == "-":
  663. self._parse_comment()
  664. else:
  665. self._write_text(this)
  666. elif this == "<" and next != "/" and (
  667. not self._context & (contexts.TAG ^ contexts.TAG_BODY)):
  668. self._parse_tag()
  669. elif self._context & contexts.TAG_OPEN:
  670. if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  671. self._handle_tag_chunk(this)
  672. elif this == "\n":
  673. self._fail_route()
  674. elif this == ">":
  675. self._handle_tag_close_open()
  676. elif this == "/" and next == ">":
  677. return self._handle_tag_selfclose()
  678. elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME:
  679. self._handle_tag_attribute_body()
  680. else:
  681. self._handle_tag_chunk(this)
  682. elif this == "<" and next == "/" and self._context & contexts.TAG_BODY:
  683. self._handle_tag_open_close()
  684. elif this == ">" and self._context & contexts.TAG_CLOSE:
  685. return self._handle_tag_close_close()
  686. else:
  687. self._write_text(this)
  688. self._head += 1
  689. def tokenize(self, text):
  690. """Build a list of tokens from a string of wikicode and return it."""
  691. split = self.regex.split(text)
  692. self._text = [segment for segment in split if segment]
  693. return self._parse()