A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.
 
 
 
 

548 рядки
20 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from math import log
  24. import re
  25. from . import contexts
  26. from . import tokens
  27. from ..compat import htmlentities
# Public API of this module: only ``Tokenizer`` is exported by a star-import;
# ``BadRoute`` is deliberately left out because it is raised internally.
__all__ = ["Tokenizer"]
  29. class BadRoute(Exception):
  30. """Raised internally when the current tokenization route is invalid."""
  31. pass
  32. class Tokenizer(object):
  33. """Creates a list of tokens from a string of wikicode."""
  34. USES_C = False
  35. START = object()
  36. END = object()
  37. MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
  38. "/", "-", "!", "\n", END]
  39. MAX_DEPTH = 40
  40. MAX_CYCLES = 100000
  41. regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)
  42. def __init__(self):
  43. self._text = None
  44. self._head = 0
  45. self._stacks = []
  46. self._global = 0
  47. self._depth = 0
  48. self._cycles = 0
  49. @property
  50. def _stack(self):
  51. """The current token stack."""
  52. return self._stacks[-1][0]
  53. @property
  54. def _context(self):
  55. """The current token context."""
  56. return self._stacks[-1][1]
  57. @_context.setter
  58. def _context(self, value):
  59. self._stacks[-1][1] = value
  60. @property
  61. def _textbuffer(self):
  62. """The current textbuffer."""
  63. return self._stacks[-1][2]
  64. @_textbuffer.setter
  65. def _textbuffer(self, value):
  66. self._stacks[-1][2] = value
  67. def _push(self, context=0):
  68. """Add a new token stack, context, and textbuffer to the list."""
  69. self._stacks.append([[], context, []])
  70. self._depth += 1
  71. self._cycles += 1
  72. def _push_textbuffer(self):
  73. """Push the textbuffer onto the stack as a Text node and clear it."""
  74. if self._textbuffer:
  75. self._stack.append(tokens.Text(text="".join(self._textbuffer)))
  76. self._textbuffer = []
  77. def _pop(self, keep_context=False):
  78. """Pop the current stack/context/textbuffer, returing the stack.
  79. If *keep_context* is ``True``, then we will replace the underlying
  80. stack's context with the current stack's.
  81. """
  82. self._push_textbuffer()
  83. self._depth -= 1
  84. if keep_context:
  85. context = self._context
  86. stack = self._stacks.pop()[0]
  87. self._context = context
  88. return stack
  89. return self._stacks.pop()[0]
  90. def _can_recurse(self):
  91. """Return whether or not our max recursion depth has been exceeded."""
  92. return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES
  93. def _fail_route(self):
  94. """Fail the current tokenization route.
  95. Discards the current stack/context/textbuffer and raises
  96. :py:exc:`~.BadRoute`.
  97. """
  98. self._pop()
  99. raise BadRoute()
  100. def _write(self, token):
  101. """Write a token to the end of the current token stack."""
  102. self._push_textbuffer()
  103. self._stack.append(token)
  104. def _write_first(self, token):
  105. """Write a token to the beginning of the current token stack."""
  106. self._push_textbuffer()
  107. self._stack.insert(0, token)
  108. def _write_text(self, text):
  109. """Write text to the current textbuffer."""
  110. self._textbuffer.append(text)
  111. def _write_all(self, tokenlist):
  112. """Write a series of tokens to the current stack at once."""
  113. if tokenlist and isinstance(tokenlist[0], tokens.Text):
  114. self._write_text(tokenlist.pop(0).text)
  115. self._push_textbuffer()
  116. self._stack.extend(tokenlist)
  117. def _write_text_then_stack(self, text):
  118. """Pop the current stack, write *text*, and then write the stack."""
  119. stack = self._pop()
  120. self._write_text(text)
  121. if stack:
  122. self._write_all(stack)
  123. self._head -= 1
  124. def _read(self, delta=0, wrap=False, strict=False):
  125. """Read the value at a relative point in the wikicode.
  126. The value is read from :py:attr:`self._head <_head>` plus the value of
  127. *delta* (which can be negative). If *wrap* is ``False``, we will not
  128. allow attempts to read from the end of the string if ``self._head +
  129. delta`` is negative. If *strict* is ``True``, the route will be failed
  130. (with :py:meth:`_fail_route`) if we try to read from past the end of
  131. the string; otherwise, :py:attr:`self.END <END>` is returned. If we try
  132. to read from before the start of the string, :py:attr:`self.START
  133. <START>` is returned.
  134. """
  135. index = self._head + delta
  136. if index < 0 and (not wrap or abs(index) > len(self._text)):
  137. return self.START
  138. try:
  139. return self._text[index]
  140. except IndexError:
  141. if strict:
  142. self._fail_route()
  143. return self.END
  144. def _parse_template_or_argument(self):
  145. """Parse a template or argument at the head of the wikicode string."""
  146. self._head += 2
  147. braces = 2
  148. while self._read() == "{":
  149. self._head += 1
  150. braces += 1
  151. self._push()
  152. while braces:
  153. if braces == 1:
  154. return self._write_text_then_stack("{")
  155. if braces == 2:
  156. try:
  157. self._parse_template()
  158. except BadRoute:
  159. return self._write_text_then_stack("{{")
  160. break
  161. try:
  162. self._parse_argument()
  163. braces -= 3
  164. except BadRoute:
  165. try:
  166. self._parse_template()
  167. braces -= 2
  168. except BadRoute:
  169. return self._write_text_then_stack("{" * braces)
  170. if braces:
  171. self._head += 1
  172. self._write_all(self._pop())
  173. def _parse_template(self):
  174. """Parse a template at the head of the wikicode string."""
  175. reset = self._head
  176. try:
  177. template = self._parse(contexts.TEMPLATE_NAME)
  178. except BadRoute:
  179. self._head = reset
  180. raise
  181. self._write_first(tokens.TemplateOpen())
  182. self._write_all(template)
  183. self._write(tokens.TemplateClose())
  184. def _parse_argument(self):
  185. """Parse an argument at the head of the wikicode string."""
  186. reset = self._head
  187. try:
  188. argument = self._parse(contexts.ARGUMENT_NAME)
  189. except BadRoute:
  190. self._head = reset
  191. raise
  192. self._write_first(tokens.ArgumentOpen())
  193. self._write_all(argument)
  194. self._write(tokens.ArgumentClose())
  195. def _handle_template_param(self):
  196. """Handle a template parameter at the head of the string."""
  197. if self._context & contexts.TEMPLATE_NAME:
  198. self._context ^= contexts.TEMPLATE_NAME
  199. elif self._context & contexts.TEMPLATE_PARAM_VALUE:
  200. self._context ^= contexts.TEMPLATE_PARAM_VALUE
  201. elif self._context & contexts.TEMPLATE_PARAM_KEY:
  202. self._write_all(self._pop(keep_context=True))
  203. self._context |= contexts.TEMPLATE_PARAM_KEY
  204. self._write(tokens.TemplateParamSeparator())
  205. self._push(self._context)
  206. def _handle_template_param_value(self):
  207. """Handle a template parameter's value at the head of the string."""
  208. self._write_all(self._pop(keep_context=True))
  209. self._context ^= contexts.TEMPLATE_PARAM_KEY
  210. self._context |= contexts.TEMPLATE_PARAM_VALUE
  211. self._write(tokens.TemplateParamEquals())
  212. def _handle_template_end(self):
  213. """Handle the end of a template at the head of the string."""
  214. if self._context & contexts.TEMPLATE_PARAM_KEY:
  215. self._write_all(self._pop(keep_context=True))
  216. self._head += 1
  217. return self._pop()
  218. def _handle_argument_separator(self):
  219. """Handle the separator between an argument's name and default."""
  220. self._context ^= contexts.ARGUMENT_NAME
  221. self._context |= contexts.ARGUMENT_DEFAULT
  222. self._write(tokens.ArgumentSeparator())
  223. def _handle_argument_end(self):
  224. """Handle the end of an argument at the head of the string."""
  225. self._head += 2
  226. return self._pop()
  227. def _parse_wikilink(self):
  228. """Parse an internal wikilink at the head of the wikicode string."""
  229. self._head += 2
  230. reset = self._head - 1
  231. try:
  232. wikilink = self._parse(contexts.WIKILINK_TITLE)
  233. except BadRoute:
  234. self._head = reset
  235. self._write_text("[[")
  236. else:
  237. self._write(tokens.WikilinkOpen())
  238. self._write_all(wikilink)
  239. self._write(tokens.WikilinkClose())
  240. def _handle_wikilink_separator(self):
  241. """Handle the separator between a wikilink's title and its text."""
  242. self._context ^= contexts.WIKILINK_TITLE
  243. self._context |= contexts.WIKILINK_TEXT
  244. self._write(tokens.WikilinkSeparator())
  245. def _handle_wikilink_end(self):
  246. """Handle the end of a wikilink at the head of the string."""
  247. self._head += 1
  248. return self._pop()
  249. def _parse_heading(self):
  250. """Parse a section heading at the head of the wikicode string."""
  251. self._global |= contexts.GL_HEADING
  252. reset = self._head
  253. self._head += 1
  254. best = 1
  255. while self._read() == "=":
  256. best += 1
  257. self._head += 1
  258. context = contexts.HEADING_LEVEL_1 << min(best - 1, 5)
  259. try:
  260. title, level = self._parse(context)
  261. except BadRoute:
  262. self._head = reset + best - 1
  263. self._write_text("=" * best)
  264. else:
  265. self._write(tokens.HeadingStart(level=level))
  266. if level < best:
  267. self._write_text("=" * (best - level))
  268. self._write_all(title)
  269. self._write(tokens.HeadingEnd())
  270. finally:
  271. self._global ^= contexts.GL_HEADING
  272. def _handle_heading_end(self):
  273. """Handle the end of a section heading at the head of the string."""
  274. reset = self._head
  275. self._head += 1
  276. best = 1
  277. while self._read() == "=":
  278. best += 1
  279. self._head += 1
  280. current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
  281. level = min(current, min(best, 6))
  282. try: # Try to check for a heading closure after this one
  283. after, after_level = self._parse(self._context)
  284. except BadRoute:
  285. if level < best:
  286. self._write_text("=" * (best - level))
  287. self._head = reset + best - 1
  288. return self._pop(), level
  289. else: # Found another closure
  290. self._write_text("=" * best)
  291. self._write_all(after)
  292. return self._pop(), after_level
  293. def _really_parse_entity(self):
  294. """Actually parse an HTML entity and ensure that it is valid."""
  295. self._write(tokens.HTMLEntityStart())
  296. self._head += 1
  297. this = self._read(strict=True)
  298. if this == "#":
  299. numeric = True
  300. self._write(tokens.HTMLEntityNumeric())
  301. self._head += 1
  302. this = self._read(strict=True)
  303. if this[0].lower() == "x":
  304. hexadecimal = True
  305. self._write(tokens.HTMLEntityHex(char=this[0]))
  306. this = this[1:]
  307. if not this:
  308. self._fail_route()
  309. else:
  310. hexadecimal = False
  311. else:
  312. numeric = hexadecimal = False
  313. valid = "0123456789abcdefABCDEF" if hexadecimal else "0123456789"
  314. if not numeric and not hexadecimal:
  315. valid += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
  316. if not all([char in valid for char in this]):
  317. self._fail_route()
  318. self._head += 1
  319. if self._read() != ";":
  320. self._fail_route()
  321. if numeric:
  322. test = int(this, 16) if hexadecimal else int(this)
  323. if test < 1 or test > 0x10FFFF:
  324. self._fail_route()
  325. else:
  326. if this not in htmlentities.entitydefs:
  327. self._fail_route()
  328. self._write(tokens.Text(text=this))
  329. self._write(tokens.HTMLEntityEnd())
  330. def _parse_entity(self):
  331. """Parse an HTML entity at the head of the wikicode string."""
  332. reset = self._head
  333. self._push()
  334. try:
  335. self._really_parse_entity()
  336. except BadRoute:
  337. self._head = reset
  338. self._write_text(self._read())
  339. else:
  340. self._write_all(self._pop())
  341. def _parse_comment(self):
  342. """Parse an HTML comment at the head of the wikicode string."""
  343. self._head += 4
  344. reset = self._head - 1
  345. try:
  346. comment = self._parse(contexts.COMMENT)
  347. except BadRoute:
  348. self._head = reset
  349. self._write_text("<!--")
  350. else:
  351. self._write(tokens.CommentStart())
  352. self._write_all(comment)
  353. self._write(tokens.CommentEnd())
  354. self._head += 2
  355. def _verify_safe(self, this):
  356. """Make sure we are not trying to write an invalid character."""
  357. context = self._context
  358. if context & contexts.FAIL_NEXT:
  359. return False
  360. if context & contexts.WIKILINK_TITLE:
  361. if this == "]" or this == "{":
  362. self._context |= contexts.FAIL_NEXT
  363. elif this == "\n" or this == "[" or this == "}":
  364. return False
  365. return True
  366. if context & contexts.TEMPLATE_NAME:
  367. if this == "{" or this == "}" or this == "[":
  368. self._context |= contexts.FAIL_NEXT
  369. return True
  370. if this == "]":
  371. return False
  372. if this == "|":
  373. return True
  374. if context & contexts.HAS_TEXT:
  375. if context & contexts.FAIL_ON_TEXT:
  376. if this is self.END or not this.isspace():
  377. return False
  378. else:
  379. if this == "\n":
  380. self._context |= contexts.FAIL_ON_TEXT
  381. elif this is self.END or not this.isspace():
  382. self._context |= contexts.HAS_TEXT
  383. return True
  384. else:
  385. if context & contexts.FAIL_ON_EQUALS:
  386. if this == "=":
  387. return False
  388. elif context & contexts.FAIL_ON_LBRACE:
  389. if this == "{" or (self._read(-1) == self._read(-2) == "{"):
  390. if context & contexts.TEMPLATE:
  391. self._context |= contexts.FAIL_ON_EQUALS
  392. else:
  393. self._context |= contexts.FAIL_NEXT
  394. return True
  395. self._context ^= contexts.FAIL_ON_LBRACE
  396. elif context & contexts.FAIL_ON_RBRACE:
  397. if this == "}":
  398. if context & contexts.TEMPLATE:
  399. self._context |= contexts.FAIL_ON_EQUALS
  400. else:
  401. self._context |= contexts.FAIL_NEXT
  402. return True
  403. self._context ^= contexts.FAIL_ON_RBRACE
  404. elif this == "{":
  405. self._context |= contexts.FAIL_ON_LBRACE
  406. elif this == "}":
  407. self._context |= contexts.FAIL_ON_RBRACE
  408. return True
  409. def _parse(self, context=0):
  410. """Parse the wikicode string, using *context* for when to stop."""
  411. self._push(context)
  412. while True:
  413. this = self._read()
  414. unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
  415. contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME)
  416. if self._context & unsafe:
  417. if not self._verify_safe(this):
  418. if self._context & contexts.TEMPLATE_PARAM_KEY:
  419. self._pop()
  420. self._fail_route()
  421. if this not in self.MARKERS:
  422. self._write_text(this)
  423. self._head += 1
  424. continue
  425. if this is self.END:
  426. fail = (contexts.TEMPLATE | contexts.ARGUMENT |
  427. contexts.WIKILINK | contexts.HEADING |
  428. contexts.COMMENT)
  429. if self._context & contexts.TEMPLATE_PARAM_KEY:
  430. self._pop()
  431. if self._context & fail:
  432. self._fail_route()
  433. return self._pop()
  434. next = self._read(1)
  435. if self._context & contexts.COMMENT:
  436. if this == next == "-" and self._read(2) == ">":
  437. return self._pop()
  438. else:
  439. self._write_text(this)
  440. elif this == next == "{":
  441. if self._can_recurse():
  442. self._parse_template_or_argument()
  443. if self._context & contexts.FAIL_NEXT:
  444. self._context ^= contexts.FAIL_NEXT
  445. else:
  446. self._write_text("{")
  447. elif this == "|" and self._context & contexts.TEMPLATE:
  448. self._handle_template_param()
  449. elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
  450. self._handle_template_param_value()
  451. elif this == next == "}" and self._context & contexts.TEMPLATE:
  452. return self._handle_template_end()
  453. elif this == "|" and self._context & contexts.ARGUMENT_NAME:
  454. self._handle_argument_separator()
  455. elif this == next == "}" and self._context & contexts.ARGUMENT:
  456. if self._read(2) == "}":
  457. return self._handle_argument_end()
  458. else:
  459. self._write_text("}")
  460. elif this == next == "[":
  461. if not self._context & contexts.WIKILINK_TITLE and self._can_recurse():
  462. self._parse_wikilink()
  463. if self._context & contexts.FAIL_NEXT:
  464. self._context ^= contexts.FAIL_NEXT
  465. else:
  466. self._write_text("[")
  467. elif this == "|" and self._context & contexts.WIKILINK_TITLE:
  468. self._handle_wikilink_separator()
  469. elif this == next == "]" and self._context & contexts.WIKILINK:
  470. return self._handle_wikilink_end()
  471. elif this == "=" and not self._global & contexts.GL_HEADING:
  472. if self._read(-1) in ("\n", self.START):
  473. self._parse_heading()
  474. else:
  475. self._write_text("=")
  476. elif this == "=" and self._context & contexts.HEADING:
  477. return self._handle_heading_end()
  478. elif this == "\n" and self._context & contexts.HEADING:
  479. self._fail_route()
  480. elif this == "&":
  481. self._parse_entity()
  482. elif this == "<" and next == "!":
  483. if self._read(2) == self._read(3) == "-":
  484. self._parse_comment()
  485. else:
  486. self._write_text(this)
  487. else:
  488. self._write_text(this)
  489. self._head += 1
  490. def tokenize(self, text):
  491. """Build a list of tokens from a string of wikicode and return it."""
  492. split = self.regex.split(text)
  493. self._text = [segment for segment in split if segment]
  494. return self._parse()