A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

298 lines
12 KiB

  1. # Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in
  11. # all copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  19. # SOFTWARE.
  20. from . import tokens
  21. from .errors import ParserError
  22. from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
  23. Template, Text, Wikilink)
  24. from ..nodes.extras import Attribute, Parameter
  25. from ..smart_list import SmartList
  26. from ..wikicode import Wikicode
  27. __all__ = ["Builder"]
  28. _HANDLERS = {
  29. tokens.Text: lambda self, token: Text(token.text)
  30. }
  31. def _add_handler(token_type):
  32. """Create a decorator that adds a handler function to the lookup table."""
  33. def decorator(func):
  34. """Add a handler function to the lookup table."""
  35. _HANDLERS[token_type] = func
  36. return func
  37. return decorator
  38. class Builder:
  39. """Builds a tree of nodes out of a sequence of tokens.
  40. To use, pass a list of :class:`.Token`\\ s to the :meth:`build` method. The
  41. list will be exhausted as it is parsed and a :class:`.Wikicode` object
  42. containing the node tree will be returned.
  43. """
  44. def __init__(self):
  45. self._tokens = []
  46. self._stacks = []
  47. def _push(self):
  48. """Push a new node list onto the stack."""
  49. self._stacks.append([])
  50. def _pop(self):
  51. """Pop the current node list off of the stack.
  52. The raw node list is wrapped in a :class:`.SmartList` and then in a
  53. :class:`.Wikicode` object.
  54. """
  55. return Wikicode(SmartList(self._stacks.pop()))
  56. def _write(self, item):
  57. """Append a node to the current node list."""
  58. self._stacks[-1].append(item)
  59. def _handle_parameter(self, default):
  60. """Handle a case where a parameter is at the head of the tokens.
  61. *default* is the value to use if no parameter name is defined.
  62. """
  63. key = None
  64. showkey = False
  65. self._push()
  66. while self._tokens:
  67. token = self._tokens.pop()
  68. if isinstance(token, tokens.TemplateParamEquals):
  69. key = self._pop()
  70. showkey = True
  71. self._push()
  72. elif isinstance(token, (tokens.TemplateParamSeparator,
  73. tokens.TemplateClose)):
  74. self._tokens.append(token)
  75. value = self._pop()
  76. if key is None:
  77. key = Wikicode(SmartList([Text(str(default))]))
  78. return Parameter(key, value, showkey)
  79. else:
  80. self._write(self._handle_token(token))
  81. raise ParserError("_handle_parameter() missed a close token")
  82. @_add_handler(tokens.TemplateOpen)
  83. def _handle_template(self, token):
  84. """Handle a case where a template is at the head of the tokens."""
  85. params = []
  86. default = 1
  87. self._push()
  88. while self._tokens:
  89. token = self._tokens.pop()
  90. if isinstance(token, tokens.TemplateParamSeparator):
  91. if not params:
  92. name = self._pop()
  93. param = self._handle_parameter(default)
  94. params.append(param)
  95. if not param.showkey:
  96. default += 1
  97. elif isinstance(token, tokens.TemplateClose):
  98. if not params:
  99. name = self._pop()
  100. return Template(name, params)
  101. else:
  102. self._write(self._handle_token(token))
  103. raise ParserError("_handle_template() missed a close token")
  104. @_add_handler(tokens.ArgumentOpen)
  105. def _handle_argument(self, token):
  106. """Handle a case where an argument is at the head of the tokens."""
  107. name = None
  108. self._push()
  109. while self._tokens:
  110. token = self._tokens.pop()
  111. if isinstance(token, tokens.ArgumentSeparator):
  112. name = self._pop()
  113. self._push()
  114. elif isinstance(token, tokens.ArgumentClose):
  115. if name is not None:
  116. return Argument(name, self._pop())
  117. return Argument(self._pop())
  118. else:
  119. self._write(self._handle_token(token))
  120. raise ParserError("_handle_argument() missed a close token")
  121. @_add_handler(tokens.WikilinkOpen)
  122. def _handle_wikilink(self, token):
  123. """Handle a case where a wikilink is at the head of the tokens."""
  124. title = None
  125. self._push()
  126. while self._tokens:
  127. token = self._tokens.pop()
  128. if isinstance(token, tokens.WikilinkSeparator):
  129. title = self._pop()
  130. self._push()
  131. elif isinstance(token, tokens.WikilinkClose):
  132. if title is not None:
  133. return Wikilink(title, self._pop())
  134. return Wikilink(self._pop())
  135. else:
  136. self._write(self._handle_token(token))
  137. raise ParserError("_handle_wikilink() missed a close token")
  138. @_add_handler(tokens.ExternalLinkOpen)
  139. def _handle_external_link(self, token):
  140. """Handle when an external link is at the head of the tokens."""
  141. brackets, url = token.brackets, None
  142. self._push()
  143. while self._tokens:
  144. token = self._tokens.pop()
  145. if isinstance(token, tokens.ExternalLinkSeparator):
  146. url = self._pop()
  147. self._push()
  148. elif isinstance(token, tokens.ExternalLinkClose):
  149. if url is not None:
  150. return ExternalLink(url, self._pop(), brackets)
  151. return ExternalLink(self._pop(), brackets=brackets)
  152. else:
  153. self._write(self._handle_token(token))
  154. raise ParserError("_handle_external_link() missed a close token")
  155. @_add_handler(tokens.HTMLEntityStart)
  156. def _handle_entity(self, token):
  157. """Handle a case where an HTML entity is at the head of the tokens."""
  158. token = self._tokens.pop()
  159. if isinstance(token, tokens.HTMLEntityNumeric):
  160. token = self._tokens.pop()
  161. if isinstance(token, tokens.HTMLEntityHex):
  162. text = self._tokens.pop()
  163. self._tokens.pop() # Remove HTMLEntityEnd
  164. return HTMLEntity(text.text, named=False, hexadecimal=True,
  165. hex_char=token.char)
  166. self._tokens.pop() # Remove HTMLEntityEnd
  167. return HTMLEntity(token.text, named=False, hexadecimal=False)
  168. self._tokens.pop() # Remove HTMLEntityEnd
  169. return HTMLEntity(token.text, named=True, hexadecimal=False)
  170. @_add_handler(tokens.HeadingStart)
  171. def _handle_heading(self, token):
  172. """Handle a case where a heading is at the head of the tokens."""
  173. level = token.level
  174. self._push()
  175. while self._tokens:
  176. token = self._tokens.pop()
  177. if isinstance(token, tokens.HeadingEnd):
  178. title = self._pop()
  179. return Heading(title, level)
  180. self._write(self._handle_token(token))
  181. raise ParserError("_handle_heading() missed a close token")
  182. @_add_handler(tokens.CommentStart)
  183. def _handle_comment(self, token):
  184. """Handle a case where an HTML comment is at the head of the tokens."""
  185. self._push()
  186. while self._tokens:
  187. token = self._tokens.pop()
  188. if isinstance(token, tokens.CommentEnd):
  189. contents = self._pop()
  190. return Comment(contents)
  191. self._write(self._handle_token(token))
  192. raise ParserError("_handle_comment() missed a close token")
  193. def _handle_attribute(self, start):
  194. """Handle a case where a tag attribute is at the head of the tokens."""
  195. name = quotes = None
  196. self._push()
  197. while self._tokens:
  198. token = self._tokens.pop()
  199. if isinstance(token, tokens.TagAttrEquals):
  200. name = self._pop()
  201. self._push()
  202. elif isinstance(token, tokens.TagAttrQuote):
  203. quotes = token.char
  204. elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen,
  205. tokens.TagCloseSelfclose)):
  206. self._tokens.append(token)
  207. if name:
  208. value = self._pop()
  209. else:
  210. name, value = self._pop(), None
  211. return Attribute(name, value, quotes, start.pad_first,
  212. start.pad_before_eq, start.pad_after_eq)
  213. else:
  214. self._write(self._handle_token(token))
  215. raise ParserError("_handle_attribute() missed a close token")
  216. @_add_handler(tokens.TagOpenOpen)
  217. def _handle_tag(self, token):
  218. """Handle a case where a tag is at the head of the tokens."""
  219. close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose)
  220. implicit, attrs, contents, closing_tag = False, [], None, None
  221. wiki_markup, invalid = token.wiki_markup, token.invalid or False
  222. wiki_style_separator, closing_wiki_markup = None, wiki_markup
  223. self._push()
  224. while self._tokens:
  225. token = self._tokens.pop()
  226. if isinstance(token, tokens.TagAttrStart):
  227. attrs.append(self._handle_attribute(token))
  228. elif isinstance(token, tokens.TagCloseOpen):
  229. wiki_style_separator = token.wiki_markup
  230. padding = token.padding or ""
  231. tag = self._pop()
  232. self._push()
  233. elif isinstance(token, tokens.TagOpenClose):
  234. closing_wiki_markup = token.wiki_markup
  235. contents = self._pop()
  236. self._push()
  237. elif isinstance(token, close_tokens):
  238. if isinstance(token, tokens.TagCloseSelfclose):
  239. closing_wiki_markup = token.wiki_markup
  240. tag = self._pop()
  241. self_closing = True
  242. padding = token.padding or ""
  243. implicit = token.implicit or False
  244. else:
  245. self_closing = False
  246. closing_tag = self._pop()
  247. return Tag(tag, contents, attrs, wiki_markup, self_closing,
  248. invalid, implicit, padding, closing_tag,
  249. wiki_style_separator, closing_wiki_markup)
  250. else:
  251. self._write(self._handle_token(token))
  252. raise ParserError("_handle_tag() missed a close token")
  253. def _handle_token(self, token):
  254. """Handle a single token."""
  255. try:
  256. return _HANDLERS[type(token)](self, token)
  257. except KeyError:
  258. err = "_handle_token() got unexpected {0}"
  259. raise ParserError(err.format(type(token).__name__)) from None
  260. def build(self, tokenlist):
  261. """Build a Wikicode object from a list tokens and return it."""
  262. self._tokens = tokenlist
  263. self._tokens.reverse()
  264. self._push()
  265. while self._tokens:
  266. node = self._handle_token(self._tokens.pop())
  267. self._write(node)
  268. return self._pop()
  269. del _add_handler