A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

771 lines
29 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from math import log
  24. import re
  25. from . import contexts
  26. from . import tokens
  27. from ..compat import htmlentities
  28. from ..nodes.tag import Tag
  29. __all__ = ["Tokenizer"]
  30. class BadRoute(Exception):
  31. """Raised internally when the current tokenization route is invalid."""
  32. pass
  33. class Tokenizer(object):
  34. """Creates a list of tokens from a string of wikicode."""
  35. USES_C = False
  36. START = object()
  37. END = object()
  38. MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
  39. "/", "-", "!", "\n", END]
  40. MAX_DEPTH = 40
  41. MAX_CYCLES = 100000
  42. regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)
  43. def __init__(self):
  44. self._text = None
  45. self._head = 0
  46. self._stacks = []
  47. self._global = 0
  48. self._depth = 0
  49. self._cycles = 0
  50. @property
  51. def _stack(self):
  52. """The current token stack."""
  53. return self._stacks[-1][0]
  54. @property
  55. def _context(self):
  56. """The current token context."""
  57. return self._stacks[-1][1]
  58. @_context.setter
  59. def _context(self, value):
  60. self._stacks[-1][1] = value
  61. @property
  62. def _textbuffer(self):
  63. """The current textbuffer."""
  64. return self._stacks[-1][2]
  65. @_textbuffer.setter
  66. def _textbuffer(self, value):
  67. self._stacks[-1][2] = value
  68. def _push(self, context=0):
  69. """Add a new token stack, context, and textbuffer to the list."""
  70. self._stacks.append([[], context, []])
  71. self._depth += 1
  72. self._cycles += 1
  73. def _push_textbuffer(self):
  74. """Push the textbuffer onto the stack as a Text node and clear it."""
  75. if self._textbuffer:
  76. self._stack.append(tokens.Text(text="".join(self._textbuffer)))
  77. self._textbuffer = []
  78. def _pop(self, keep_context=False):
  79. """Pop the current stack/context/textbuffer, returing the stack.
  80. If *keep_context* is ``True``, then we will replace the underlying
  81. stack's context with the current stack's.
  82. """
  83. self._push_textbuffer()
  84. self._depth -= 1
  85. if keep_context:
  86. context = self._context
  87. stack = self._stacks.pop()[0]
  88. self._context = context
  89. return stack
  90. return self._stacks.pop()[0]
  91. def _can_recurse(self):
  92. """Return whether or not our max recursion depth has been exceeded."""
  93. return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES
  94. def _fail_route(self):
  95. """Fail the current tokenization route.
  96. Discards the current stack/context/textbuffer and raises
  97. :py:exc:`~.BadRoute`.
  98. """
  99. self._pop()
  100. raise BadRoute()
  101. def _write(self, token):
  102. """Write a token to the end of the current token stack."""
  103. self._push_textbuffer()
  104. self._stack.append(token)
  105. def _write_first(self, token):
  106. """Write a token to the beginning of the current token stack."""
  107. self._push_textbuffer()
  108. self._stack.insert(0, token)
  109. def _write_text(self, text):
  110. """Write text to the current textbuffer."""
  111. self._textbuffer.append(text)
  112. def _write_all(self, tokenlist):
  113. """Write a series of tokens to the current stack at once."""
  114. if tokenlist and isinstance(tokenlist[0], tokens.Text):
  115. self._write_text(tokenlist.pop(0).text)
  116. self._push_textbuffer()
  117. self._stack.extend(tokenlist)
  118. def _write_text_then_stack(self, text):
  119. """Pop the current stack, write *text*, and then write the stack."""
  120. stack = self._pop()
  121. self._write_text(text)
  122. if stack:
  123. self._write_all(stack)
  124. self._head -= 1
  125. def _read(self, delta=0, wrap=False, strict=False):
  126. """Read the value at a relative point in the wikicode.
  127. The value is read from :py:attr:`self._head <_head>` plus the value of
  128. *delta* (which can be negative). If *wrap* is ``False``, we will not
  129. allow attempts to read from the end of the string if ``self._head +
  130. delta`` is negative. If *strict* is ``True``, the route will be failed
  131. (with :py:meth:`_fail_route`) if we try to read from past the end of
  132. the string; otherwise, :py:attr:`self.END <END>` is returned. If we try
  133. to read from before the start of the string, :py:attr:`self.START
  134. <START>` is returned.
  135. """
  136. index = self._head + delta
  137. if index < 0 and (not wrap or abs(index) > len(self._text)):
  138. return self.START
  139. try:
  140. return self._text[index]
  141. except IndexError:
  142. if strict:
  143. self._fail_route()
  144. return self.END
  145. def _parse_template_or_argument(self):
  146. """Parse a template or argument at the head of the wikicode string."""
  147. self._head += 2
  148. braces = 2
  149. while self._read() == "{":
  150. self._head += 1
  151. braces += 1
  152. self._push()
  153. while braces:
  154. if braces == 1:
  155. return self._write_text_then_stack("{")
  156. if braces == 2:
  157. try:
  158. self._parse_template()
  159. except BadRoute:
  160. return self._write_text_then_stack("{{")
  161. break
  162. try:
  163. self._parse_argument()
  164. braces -= 3
  165. except BadRoute:
  166. try:
  167. self._parse_template()
  168. braces -= 2
  169. except BadRoute:
  170. return self._write_text_then_stack("{" * braces)
  171. if braces:
  172. self._head += 1
  173. self._write_all(self._pop())
  174. def _parse_template(self):
  175. """Parse a template at the head of the wikicode string."""
  176. reset = self._head
  177. try:
  178. template = self._parse(contexts.TEMPLATE_NAME)
  179. except BadRoute:
  180. self._head = reset
  181. raise
  182. self._write_first(tokens.TemplateOpen())
  183. self._write_all(template)
  184. self._write(tokens.TemplateClose())
  185. def _parse_argument(self):
  186. """Parse an argument at the head of the wikicode string."""
  187. reset = self._head
  188. try:
  189. argument = self._parse(contexts.ARGUMENT_NAME)
  190. except BadRoute:
  191. self._head = reset
  192. raise
  193. self._write_first(tokens.ArgumentOpen())
  194. self._write_all(argument)
  195. self._write(tokens.ArgumentClose())
  196. def _handle_template_param(self):
  197. """Handle a template parameter at the head of the string."""
  198. if self._context & contexts.TEMPLATE_NAME:
  199. self._context ^= contexts.TEMPLATE_NAME
  200. elif self._context & contexts.TEMPLATE_PARAM_VALUE:
  201. self._context ^= contexts.TEMPLATE_PARAM_VALUE
  202. elif self._context & contexts.TEMPLATE_PARAM_KEY:
  203. self._write_all(self._pop(keep_context=True))
  204. self._context |= contexts.TEMPLATE_PARAM_KEY
  205. self._write(tokens.TemplateParamSeparator())
  206. self._push(self._context)
  207. def _handle_template_param_value(self):
  208. """Handle a template parameter's value at the head of the string."""
  209. self._write_all(self._pop(keep_context=True))
  210. self._context ^= contexts.TEMPLATE_PARAM_KEY
  211. self._context |= contexts.TEMPLATE_PARAM_VALUE
  212. self._write(tokens.TemplateParamEquals())
  213. def _handle_template_end(self):
  214. """Handle the end of a template at the head of the string."""
  215. if self._context & contexts.TEMPLATE_PARAM_KEY:
  216. self._write_all(self._pop(keep_context=True))
  217. self._head += 1
  218. return self._pop()
  219. def _handle_argument_separator(self):
  220. """Handle the separator between an argument's name and default."""
  221. self._context ^= contexts.ARGUMENT_NAME
  222. self._context |= contexts.ARGUMENT_DEFAULT
  223. self._write(tokens.ArgumentSeparator())
  224. def _handle_argument_end(self):
  225. """Handle the end of an argument at the head of the string."""
  226. self._head += 2
  227. return self._pop()
  228. def _parse_wikilink(self):
  229. """Parse an internal wikilink at the head of the wikicode string."""
  230. self._head += 2
  231. reset = self._head - 1
  232. try:
  233. wikilink = self._parse(contexts.WIKILINK_TITLE)
  234. except BadRoute:
  235. self._head = reset
  236. self._write_text("[[")
  237. else:
  238. self._write(tokens.WikilinkOpen())
  239. self._write_all(wikilink)
  240. self._write(tokens.WikilinkClose())
  241. def _handle_wikilink_separator(self):
  242. """Handle the separator between a wikilink's title and its text."""
  243. self._context ^= contexts.WIKILINK_TITLE
  244. self._context |= contexts.WIKILINK_TEXT
  245. self._write(tokens.WikilinkSeparator())
  246. def _handle_wikilink_end(self):
  247. """Handle the end of a wikilink at the head of the string."""
  248. self._head += 1
  249. return self._pop()
  250. def _parse_heading(self):
  251. """Parse a section heading at the head of the wikicode string."""
  252. self._global |= contexts.GL_HEADING
  253. reset = self._head
  254. self._head += 1
  255. best = 1
  256. while self._read() == "=":
  257. best += 1
  258. self._head += 1
  259. context = contexts.HEADING_LEVEL_1 << min(best - 1, 5)
  260. try:
  261. title, level = self._parse(context)
  262. except BadRoute:
  263. self._head = reset + best - 1
  264. self._write_text("=" * best)
  265. else:
  266. self._write(tokens.HeadingStart(level=level))
  267. if level < best:
  268. self._write_text("=" * (best - level))
  269. self._write_all(title)
  270. self._write(tokens.HeadingEnd())
  271. finally:
  272. self._global ^= contexts.GL_HEADING
  273. def _handle_heading_end(self):
  274. """Handle the end of a section heading at the head of the string."""
  275. reset = self._head
  276. self._head += 1
  277. best = 1
  278. while self._read() == "=":
  279. best += 1
  280. self._head += 1
  281. current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
  282. level = min(current, min(best, 6))
  283. try: # Try to check for a heading closure after this one
  284. after, after_level = self._parse(self._context)
  285. except BadRoute:
  286. if level < best:
  287. self._write_text("=" * (best - level))
  288. self._head = reset + best - 1
  289. return self._pop(), level
  290. else: # Found another closure
  291. self._write_text("=" * best)
  292. self._write_all(after)
  293. return self._pop(), after_level
  294. def _really_parse_entity(self):
  295. """Actually parse an HTML entity and ensure that it is valid."""
  296. self._write(tokens.HTMLEntityStart())
  297. self._head += 1
  298. this = self._read(strict=True)
  299. if this == "#":
  300. numeric = True
  301. self._write(tokens.HTMLEntityNumeric())
  302. self._head += 1
  303. this = self._read(strict=True)
  304. if this[0].lower() == "x":
  305. hexadecimal = True
  306. self._write(tokens.HTMLEntityHex(char=this[0]))
  307. this = this[1:]
  308. if not this:
  309. self._fail_route()
  310. else:
  311. hexadecimal = False
  312. else:
  313. numeric = hexadecimal = False
  314. valid = "0123456789abcdefABCDEF" if hexadecimal else "0123456789"
  315. if not numeric and not hexadecimal:
  316. valid += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
  317. if not all([char in valid for char in this]):
  318. self._fail_route()
  319. self._head += 1
  320. if self._read() != ";":
  321. self._fail_route()
  322. if numeric:
  323. test = int(this, 16) if hexadecimal else int(this)
  324. if test < 1 or test > 0x10FFFF:
  325. self._fail_route()
  326. else:
  327. if this not in htmlentities.entitydefs:
  328. self._fail_route()
  329. self._write(tokens.Text(text=this))
  330. self._write(tokens.HTMLEntityEnd())
  331. def _parse_entity(self):
  332. """Parse an HTML entity at the head of the wikicode string."""
  333. reset = self._head
  334. self._push()
  335. try:
  336. self._really_parse_entity()
  337. except BadRoute:
  338. self._head = reset
  339. self._write_text(self._read())
  340. else:
  341. self._write_all(self._pop())
  342. def _parse_comment(self):
  343. """Parse an HTML comment at the head of the wikicode string."""
  344. self._head += 4
  345. reset = self._head - 1
  346. try:
  347. comment = self._parse(contexts.COMMENT)
  348. except BadRoute:
  349. self._head = reset
  350. self._write_text("<!--")
  351. else:
  352. self._write(tokens.CommentStart())
  353. self._write_all(comment)
  354. self._write(tokens.CommentEnd())
  355. self._head += 2
  356. def _parse_tag(self):
  357. """Parse an HTML tag at the head of the wikicode string."""
  358. reset = self._head
  359. self._head += 1
  360. try:
  361. tokens = self._parse(contexts.TAG_OPEN_NAME)
  362. except BadRoute:
  363. self._head = reset
  364. self._write_text("<")
  365. else:
  366. self._write_all(tokens)
  367. def _get_tag_type_from_stack(self, stack=None):
  368. """Return the tag type based on the text in *stack*.
  369. If *stack* is ``None``, we will use the current, topmost one.
  370. """
  371. if stack is None:
  372. stack = self._stack
  373. self._push_textbuffer()
  374. if not stack:
  375. self._fail_route() # Tag has an empty name?
  376. text = [tok for tok in stack if isinstance(tok, tokens.Text)]
  377. text = "".join([token.text for token in text]).rstrip().lower()
  378. try:
  379. return Tag.TRANSLATIONS[text]
  380. except KeyError:
  381. return Tag.TAG_UNKNOWN
  382. def _actually_close_tag_opening(self):
  383. """Handle cleanup at the end of a opening tag.
  384. The current context will be updated and the
  385. :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the
  386. opening tag's padding to be used in the
  387. :py:class:`~.tokens.TagOpenClose` token.
  388. """
  389. if self._context & contexts.TAG_OPEN_ATTR:
  390. if self._context & contexts.TAG_OPEN_ATTR_NAME:
  391. self._context ^= contexts.TAG_OPEN_ATTR_NAME
  392. if self._context & contexts.TAG_OPEN_ATTR_BODY:
  393. self._context ^= contexts.TAG_OPEN_ATTR_BODY
  394. else:
  395. tag = self._get_tag_type_from_stack()
  396. self._write_first(tokens.TagOpenOpen(type=tag, showtag=True))
  397. self._context ^= contexts.TAG_OPEN_NAME
  398. self._context |= contexts.TAG_BODY
  399. self._push_textbuffer()
  400. if isinstance(self._stack[-1], tokens.TagAttrStart):
  401. return self._stack.pop().padding
  402. return ""
  403. def _actually_handle_chunk(self, chunks, is_new):
  404. """Actually handle a chunk of code within a tag's attributes.
  405. Called by :py:meth:`_handle_tag_chunk` and
  406. :py:meth:`_handle_tag_attribute_body`.
  407. """
  408. if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  409. padding = 0
  410. while chunks:
  411. if chunks[0] == "":
  412. padding += 1
  413. chunks.pop(0)
  414. else:
  415. break
  416. self._write(tokens.TagAttrStart(padding=" " * padding))
  417. elif self._context & contexts.TAG_OPEN_ATTR_IGNORE:
  418. self._context ^= contexts.TAG_OPEN_ATTR_IGNORE
  419. chunks.pop(0)
  420. return
  421. elif is_new and self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  422. self._write_text(" ") # Quoted chunks don't lose their spaces
  423. if chunks:
  424. chunk = chunks.pop(0)
  425. if self._context & contexts.TAG_OPEN_ATTR_BODY:
  426. self._context ^= contexts.TAG_OPEN_ATTR_BODY
  427. self._context |= contexts.TAG_OPEN_ATTR_NAME
  428. if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  429. if re.search(r'[^\\]"', chunk[:-1]):
  430. self._fail_route()
  431. if re.search(r'[^\\]"$', chunk):
  432. self._write_text(chunk[:-1])
  433. self._context ^= contexts.TAG_OPEN_ATTR_QUOTED
  434. self._context |= contexts.TAG_OPEN_ATTR_NAME
  435. return True # Back to _handle_tag_attribute_body()
  436. self._write_text(chunk)
  437. def _handle_tag_chunk(self, text):
  438. """Handle a chunk of code within a tag's attributes.
  439. This is called by :py:meth:`_parse`, which intercepts parsing of
  440. wikicode when we're inside of an opening tag and no :py:attr:`MARKERS`
  441. are present.
  442. """
  443. if " " not in text and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  444. self._write_text(text)
  445. return
  446. chunks = text.split(" ")
  447. is_new = False
  448. is_quoted = False
  449. if self._context & contexts.TAG_OPEN_NAME:
  450. self._write_text(chunks.pop(0))
  451. tag = self._get_tag_type_from_stack()
  452. self._write_first(tokens.TagOpenOpen(type=tag, showtag=True))
  453. self._context ^= contexts.TAG_OPEN_NAME
  454. self._context |= contexts.TAG_OPEN_ATTR_NAME
  455. self._actually_handle_chunk(chunks, True)
  456. is_new = True
  457. while chunks:
  458. result = self._actually_handle_chunk(chunks, is_new)
  459. is_quoted = result or is_quoted
  460. is_new = True
  461. if is_quoted:
  462. return self._pop()
  463. def _handle_tag_attribute_body(self):
  464. """Handle the body, or value, of a tag attribute.
  465. Attribute bodies can usually be handled at once, but sometimes a new
  466. stack must be created to keep track of "rich" attribute values that
  467. contain, for example, templates.
  468. """
  469. self._context ^= contexts.TAG_OPEN_ATTR_NAME
  470. self._context |= contexts.TAG_OPEN_ATTR_BODY
  471. self._write(tokens.TagAttrEquals())
  472. next = self._read(1)
  473. if next not in self.MARKERS and next.startswith('"'):
  474. chunks = None
  475. if " " in next:
  476. chunks = next.split(" ")
  477. next = chunks.pop(0)
  478. if re.search(r'[^\\]"$', next[1:]):
  479. if not re.search(r'[^\\]"', next[1:-1]):
  480. self._write(tokens.TagAttrQuote())
  481. self._write_text(next[1:-1])
  482. self._head += 1
  483. else:
  484. if not re.search(r'[^\\]"', next[1:]):
  485. self._head += 1
  486. reset = self._head
  487. try:
  488. attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED |
  489. contexts.TAG_OPEN_ATTR_IGNORE)
  490. except BadRoute:
  491. self._head = reset
  492. self._write_text(next)
  493. else:
  494. self._write(tokens.TagAttrQuote())
  495. self._write_text(next[1:])
  496. self._write_all(attr)
  497. return
  498. self._context ^= contexts.TAG_OPEN_ATTR_BODY
  499. self._context |= contexts.TAG_OPEN_ATTR_NAME
  500. while chunks:
  501. self._actually_handle_chunk(chunks, True)
  502. def _handle_tag_close_open(self):
  503. """Handle the ending of an open tag (``<foo>``)."""
  504. padding = self._actually_close_tag_opening()
  505. self._write(tokens.TagCloseOpen(padding=padding))
  506. def _handle_tag_selfclose(self):
  507. """Handle the ending of an tag that closes itself (``<foo />``)."""
  508. padding = self._actually_close_tag_opening()
  509. self._write(tokens.TagCloseSelfclose(padding=padding))
  510. self._head += 1
  511. return self._pop()
  512. def _handle_tag_open_close(self):
  513. """Handle the opening of a closing tag (``</foo>``)."""
  514. self._write(tokens.TagOpenClose())
  515. self._push(contexts.TAG_CLOSE)
  516. self._head += 1
  517. def _handle_tag_close_close(self):
  518. """Handle the ending of a closing tag (``</foo>``)."""
  519. closing = self._pop()
  520. tag = self._get_tag_type_from_stack(closing)
  521. if tag != self._stack[0].type:
  522. # Closing and opening tags are not the same, so fail this route:
  523. self._fail_route()
  524. self._write_all(closing)
  525. self._write(tokens.TagCloseClose())
  526. return self._pop()
  527. def _verify_safe(self, this):
  528. """Make sure we are not trying to write an invalid character."""
  529. context = self._context
  530. if context & contexts.FAIL_NEXT:
  531. return False
  532. if context & contexts.WIKILINK_TITLE:
  533. if this == "]" or this == "{":
  534. self._context |= contexts.FAIL_NEXT
  535. elif this == "\n" or this == "[" or this == "}":
  536. return False
  537. return True
  538. elif context & contexts.TEMPLATE_NAME:
  539. if this == "{" or this == "}" or this == "[":
  540. self._context |= contexts.FAIL_NEXT
  541. return True
  542. if this == "]":
  543. return False
  544. if this == "|":
  545. return True
  546. if context & contexts.HAS_TEXT:
  547. if context & contexts.FAIL_ON_TEXT:
  548. if this is self.END or not this.isspace():
  549. return False
  550. else:
  551. if this == "\n":
  552. self._context |= contexts.FAIL_ON_TEXT
  553. elif this is self.END or not this.isspace():
  554. self._context |= contexts.HAS_TEXT
  555. return True
  556. elif context & contexts.TAG_CLOSE:
  557. return this != "<" and this != "\n"
  558. else:
  559. if context & contexts.FAIL_ON_EQUALS:
  560. if this == "=":
  561. return False
  562. elif context & contexts.FAIL_ON_LBRACE:
  563. if this == "{" or (self._read(-1) == self._read(-2) == "{"):
  564. if context & contexts.TEMPLATE:
  565. self._context |= contexts.FAIL_ON_EQUALS
  566. else:
  567. self._context |= contexts.FAIL_NEXT
  568. return True
  569. self._context ^= contexts.FAIL_ON_LBRACE
  570. elif context & contexts.FAIL_ON_RBRACE:
  571. if this == "}":
  572. if context & contexts.TEMPLATE:
  573. self._context |= contexts.FAIL_ON_EQUALS
  574. else:
  575. self._context |= contexts.FAIL_NEXT
  576. return True
  577. self._context ^= contexts.FAIL_ON_RBRACE
  578. elif this == "{":
  579. self._context |= contexts.FAIL_ON_LBRACE
  580. elif this == "}":
  581. self._context |= contexts.FAIL_ON_RBRACE
  582. return True
  583. def _parse(self, context=0):
  584. """Parse the wikicode string, using *context* for when to stop."""
  585. self._push(context)
  586. while True:
  587. this = self._read()
  588. unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
  589. contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME |
  590. contexts.TAG_CLOSE)
  591. if self._context & unsafe:
  592. if not self._verify_safe(this):
  593. double = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
  594. if self._context & double:
  595. self._pop()
  596. self._fail_route()
  597. if this not in self.MARKERS:
  598. if self._context & contexts.TAG_OPEN:
  599. should_exit = self._handle_tag_chunk(this)
  600. if should_exit:
  601. return should_exit
  602. else:
  603. self._write_text(this)
  604. self._head += 1
  605. continue
  606. if this is self.END:
  607. fail = (
  608. contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
  609. contexts.HEADING | contexts.COMMENT | contexts.TAG)
  610. if self._context & fail:
  611. double_fail = (
  612. contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE |
  613. contexts.TAG_OPEN_ATTR_QUOTED)
  614. if self._context & double_fail:
  615. self._pop()
  616. self._fail_route()
  617. return self._pop()
  618. next = self._read(1)
  619. if self._context & contexts.COMMENT:
  620. if this == next == "-" and self._read(2) == ">":
  621. return self._pop()
  622. else:
  623. self._write_text(this)
  624. elif this == next == "{":
  625. if self._can_recurse():
  626. self._parse_template_or_argument()
  627. if self._context & contexts.FAIL_NEXT:
  628. self._context ^= contexts.FAIL_NEXT
  629. else:
  630. self._write_text("{")
  631. elif this == "|" and self._context & contexts.TEMPLATE:
  632. self._handle_template_param()
  633. elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
  634. self._handle_template_param_value()
  635. elif this == next == "}" and self._context & contexts.TEMPLATE:
  636. return self._handle_template_end()
  637. elif this == "|" and self._context & contexts.ARGUMENT_NAME:
  638. self._handle_argument_separator()
  639. elif this == next == "}" and self._context & contexts.ARGUMENT:
  640. if self._read(2) == "}":
  641. return self._handle_argument_end()
  642. else:
  643. self._write_text("}")
  644. elif this == next == "[":
  645. if not self._context & contexts.WIKILINK_TITLE and self._can_recurse():
  646. self._parse_wikilink()
  647. if self._context & contexts.FAIL_NEXT:
  648. self._context ^= contexts.FAIL_NEXT
  649. else:
  650. self._write_text("[")
  651. elif this == "|" and self._context & contexts.WIKILINK_TITLE:
  652. self._handle_wikilink_separator()
  653. elif this == next == "]" and self._context & contexts.WIKILINK:
  654. return self._handle_wikilink_end()
  655. elif this == "=" and not self._global & contexts.GL_HEADING:
  656. if self._read(-1) in ("\n", self.START):
  657. self._parse_heading()
  658. elif self._context & contexts.TAG_OPEN_ATTR_NAME:
  659. self._handle_tag_attribute_body()
  660. else:
  661. self._write_text("=")
  662. elif this == "=" and self._context & contexts.HEADING:
  663. return self._handle_heading_end()
  664. elif this == "\n" and self._context & contexts.HEADING:
  665. self._fail_route()
  666. elif this == "&":
  667. self._parse_entity()
  668. elif this == "<" and next == "!":
  669. if self._read(2) == self._read(3) == "-":
  670. self._parse_comment()
  671. else:
  672. self._write_text(this)
  673. elif this == "<" and next != "/" and (
  674. not self._context & (contexts.TAG ^ contexts.TAG_BODY)):
  675. self._parse_tag()
  676. elif self._context & contexts.TAG_OPEN:
  677. if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
  678. self._handle_tag_chunk(this)
  679. elif this == "\n":
  680. self._fail_route()
  681. elif this == ">":
  682. self._handle_tag_close_open()
  683. elif this == "/" and next == ">":
  684. return self._handle_tag_selfclose()
  685. elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME:
  686. self._handle_tag_attribute_body()
  687. else:
  688. self._handle_tag_chunk(this)
  689. elif this == "<" and next == "/" and self._context & contexts.TAG_BODY:
  690. self._handle_tag_open_close()
  691. elif this == ">" and self._context & contexts.TAG_CLOSE:
  692. return self._handle_tag_close_close()
  693. else:
  694. self._write_text(this)
  695. self._head += 1
  696. def tokenize(self, text):
  697. """Build a list of tokens from a string of wikicode and return it."""
  698. split = self.regex.split(text)
  699. self._text = [segment for segment in split if segment]
  700. return self._parse()