A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

338 lines
13 KiB

  1. # Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in
  11. # all copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  19. # SOFTWARE.
  20. from . import tokens
  21. from .errors import ParserError
  22. from ..nodes import (
  23. Argument,
  24. Comment,
  25. ExternalLink,
  26. Heading,
  27. HTMLEntity,
  28. Tag,
  29. Template,
  30. Text,
  31. Wikilink,
  32. )
  33. from ..nodes.extras import Attribute, Parameter
  34. from ..smart_list import SmartList
  35. from ..wikicode import Wikicode
  36. __all__ = ["Builder"]
  37. _HANDLERS = {tokens.Text: lambda self, token: Text(token.text)}
  38. def _add_handler(token_type):
  39. """Create a decorator that adds a handler function to the lookup table."""
  40. def decorator(func):
  41. """Add a handler function to the lookup table."""
  42. _HANDLERS[token_type] = func
  43. return func
  44. return decorator
  45. class Builder:
  46. """Builds a tree of nodes out of a sequence of tokens.
  47. To use, pass a list of :class:`.Token`\\ s to the :meth:`build` method. The
  48. list will be exhausted as it is parsed and a :class:`.Wikicode` object
  49. containing the node tree will be returned.
  50. """
  51. def __init__(self):
  52. self._tokens = []
  53. self._stacks = []
  54. def _push(self):
  55. """Push a new node list onto the stack."""
  56. self._stacks.append([])
  57. def _pop(self):
  58. """Pop the current node list off of the stack.
  59. The raw node list is wrapped in a :class:`.SmartList` and then in a
  60. :class:`.Wikicode` object.
  61. """
  62. return Wikicode(SmartList(self._stacks.pop()))
  63. def _write(self, item):
  64. """Append a node to the current node list."""
  65. self._stacks[-1].append(item)
  66. def _handle_parameter(self, default):
  67. """Handle a case where a parameter is at the head of the tokens.
  68. *default* is the value to use if no parameter name is defined.
  69. """
  70. key = None
  71. showkey = False
  72. self._push()
  73. while self._tokens:
  74. token = self._tokens.pop()
  75. if isinstance(token, tokens.TemplateParamEquals):
  76. key = self._pop()
  77. showkey = True
  78. self._push()
  79. elif isinstance(
  80. token, (tokens.TemplateParamSeparator, tokens.TemplateClose)
  81. ):
  82. self._tokens.append(token)
  83. value = self._pop()
  84. if key is None:
  85. key = Wikicode(SmartList([Text(str(default))]))
  86. return Parameter(key, value, showkey)
  87. else:
  88. self._write(self._handle_token(token))
  89. raise ParserError("_handle_parameter() missed a close token")
  90. @_add_handler(tokens.TemplateOpen)
  91. def _handle_template(self, token):
  92. """Handle a case where a template is at the head of the tokens."""
  93. params = []
  94. default = 1
  95. self._push()
  96. while self._tokens:
  97. token = self._tokens.pop()
  98. if isinstance(token, tokens.TemplateParamSeparator):
  99. if not params:
  100. name = self._pop()
  101. param = self._handle_parameter(default)
  102. params.append(param)
  103. if not param.showkey:
  104. default += 1
  105. elif isinstance(token, tokens.TemplateClose):
  106. if not params:
  107. name = self._pop()
  108. return Template(name, params)
  109. else:
  110. self._write(self._handle_token(token))
  111. raise ParserError("_handle_template() missed a close token")
  112. @_add_handler(tokens.ArgumentOpen)
  113. def _handle_argument(self, token):
  114. """Handle a case where an argument is at the head of the tokens."""
  115. name = None
  116. self._push()
  117. while self._tokens:
  118. token = self._tokens.pop()
  119. if isinstance(token, tokens.ArgumentSeparator):
  120. name = self._pop()
  121. self._push()
  122. elif isinstance(token, tokens.ArgumentClose):
  123. if name is not None:
  124. return Argument(name, self._pop())
  125. return Argument(self._pop())
  126. else:
  127. self._write(self._handle_token(token))
  128. raise ParserError("_handle_argument() missed a close token")
  129. @_add_handler(tokens.WikilinkOpen)
  130. def _handle_wikilink(self, token):
  131. """Handle a case where a wikilink is at the head of the tokens."""
  132. title = None
  133. self._push()
  134. while self._tokens:
  135. token = self._tokens.pop()
  136. if isinstance(token, tokens.WikilinkSeparator):
  137. title = self._pop()
  138. self._push()
  139. elif isinstance(token, tokens.WikilinkClose):
  140. if title is not None:
  141. return Wikilink(title, self._pop())
  142. return Wikilink(self._pop())
  143. else:
  144. self._write(self._handle_token(token))
  145. raise ParserError("_handle_wikilink() missed a close token")
  146. @_add_handler(tokens.ExternalLinkOpen)
  147. def _handle_external_link(self, token):
  148. """Handle when an external link is at the head of the tokens."""
  149. brackets, url, suppress_space = token.brackets, None, None
  150. self._push()
  151. while self._tokens:
  152. token = self._tokens.pop()
  153. if isinstance(token, tokens.ExternalLinkSeparator):
  154. url = self._pop()
  155. suppress_space = token.suppress_space
  156. self._push()
  157. elif isinstance(token, tokens.ExternalLinkClose):
  158. if url is not None:
  159. return ExternalLink(
  160. url,
  161. self._pop(),
  162. brackets=brackets,
  163. suppress_space=suppress_space is True,
  164. )
  165. return ExternalLink(
  166. self._pop(),
  167. brackets=brackets,
  168. suppress_space=suppress_space is True,
  169. )
  170. else:
  171. self._write(self._handle_token(token))
  172. raise ParserError("_handle_external_link() missed a close token")
  173. @_add_handler(tokens.HTMLEntityStart)
  174. def _handle_entity(self, token):
  175. """Handle a case where an HTML entity is at the head of the tokens."""
  176. token = self._tokens.pop()
  177. if isinstance(token, tokens.HTMLEntityNumeric):
  178. token = self._tokens.pop()
  179. if isinstance(token, tokens.HTMLEntityHex):
  180. text = self._tokens.pop()
  181. self._tokens.pop() # Remove HTMLEntityEnd
  182. return HTMLEntity(
  183. text.text, named=False, hexadecimal=True, hex_char=token.char
  184. )
  185. self._tokens.pop() # Remove HTMLEntityEnd
  186. return HTMLEntity(token.text, named=False, hexadecimal=False)
  187. self._tokens.pop() # Remove HTMLEntityEnd
  188. return HTMLEntity(token.text, named=True, hexadecimal=False)
  189. @_add_handler(tokens.HeadingStart)
  190. def _handle_heading(self, token):
  191. """Handle a case where a heading is at the head of the tokens."""
  192. level = token.level
  193. self._push()
  194. while self._tokens:
  195. token = self._tokens.pop()
  196. if isinstance(token, tokens.HeadingEnd):
  197. title = self._pop()
  198. return Heading(title, level)
  199. self._write(self._handle_token(token))
  200. raise ParserError("_handle_heading() missed a close token")
  201. @_add_handler(tokens.CommentStart)
  202. def _handle_comment(self, token):
  203. """Handle a case where an HTML comment is at the head of the tokens."""
  204. self._push()
  205. while self._tokens:
  206. token = self._tokens.pop()
  207. if isinstance(token, tokens.CommentEnd):
  208. contents = self._pop()
  209. return Comment(contents)
  210. self._write(self._handle_token(token))
  211. raise ParserError("_handle_comment() missed a close token")
  212. def _handle_attribute(self, start):
  213. """Handle a case where a tag attribute is at the head of the tokens."""
  214. name = quotes = None
  215. self._push()
  216. while self._tokens:
  217. token = self._tokens.pop()
  218. if isinstance(token, tokens.TagAttrEquals):
  219. name = self._pop()
  220. self._push()
  221. elif isinstance(token, tokens.TagAttrQuote):
  222. quotes = token.char
  223. elif isinstance(
  224. token,
  225. (tokens.TagAttrStart, tokens.TagCloseOpen, tokens.TagCloseSelfclose),
  226. ):
  227. self._tokens.append(token)
  228. if name:
  229. value = self._pop()
  230. else:
  231. name, value = self._pop(), None
  232. return Attribute(
  233. name,
  234. value,
  235. quotes,
  236. start.pad_first,
  237. start.pad_before_eq,
  238. start.pad_after_eq,
  239. )
  240. else:
  241. self._write(self._handle_token(token))
  242. raise ParserError("_handle_attribute() missed a close token")
  243. @_add_handler(tokens.TagOpenOpen)
  244. def _handle_tag(self, token):
  245. """Handle a case where a tag is at the head of the tokens."""
  246. close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose)
  247. implicit, attrs, contents, closing_tag = False, [], None, None
  248. wiki_markup, invalid = token.wiki_markup, token.invalid or False
  249. wiki_style_separator, closing_wiki_markup = None, wiki_markup
  250. self._push()
  251. while self._tokens:
  252. token = self._tokens.pop()
  253. if isinstance(token, tokens.TagAttrStart):
  254. attrs.append(self._handle_attribute(token))
  255. elif isinstance(token, tokens.TagCloseOpen):
  256. wiki_style_separator = token.wiki_markup
  257. padding = token.padding or ""
  258. tag = self._pop()
  259. self._push()
  260. elif isinstance(token, tokens.TagOpenClose):
  261. closing_wiki_markup = token.wiki_markup
  262. contents = self._pop()
  263. self._push()
  264. elif isinstance(token, close_tokens):
  265. if isinstance(token, tokens.TagCloseSelfclose):
  266. closing_wiki_markup = token.wiki_markup
  267. tag = self._pop()
  268. self_closing = True
  269. padding = token.padding or ""
  270. implicit = token.implicit or False
  271. else:
  272. self_closing = False
  273. closing_tag = self._pop()
  274. return Tag(
  275. tag,
  276. contents,
  277. attrs,
  278. wiki_markup,
  279. self_closing,
  280. invalid,
  281. implicit,
  282. padding,
  283. closing_tag,
  284. wiki_style_separator,
  285. closing_wiki_markup,
  286. )
  287. else:
  288. self._write(self._handle_token(token))
  289. raise ParserError("_handle_tag() missed a close token")
  290. def _handle_token(self, token):
  291. """Handle a single token."""
  292. try:
  293. return _HANDLERS[type(token)](self, token)
  294. except KeyError:
  295. err = "_handle_token() got unexpected {0}"
  296. raise ParserError(err.format(type(token).__name__)) from None
  297. def build(self, tokenlist):
  298. """Build a Wikicode object from a list tokens and return it."""
  299. self._tokens = tokenlist
  300. self._tokens.reverse()
  301. self._push()
  302. while self._tokens:
  303. node = self._handle_token(self._tokens.pop())
  304. self._write(node)
  305. return self._pop()
  306. del _add_handler