A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

299 lines
12 KiB

  1. #
  2. # Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
  3. #
  4. # Permission is hereby granted, free of charge, to any person obtaining a copy
  5. # of this software and associated documentation files (the "Software"), to deal
  6. # in the Software without restriction, including without limitation the rights
  7. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. # copies of the Software, and to permit persons to whom the Software is
  9. # furnished to do so, subject to the following conditions:
  10. #
  11. # The above copyright notice and this permission notice shall be included in
  12. # all copies or substantial portions of the Software.
  13. #
  14. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  20. # SOFTWARE.
  21. from . import tokens
  22. from .errors import ParserError
  23. from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
  24. Template, Text, Wikilink)
  25. from ..nodes.extras import Attribute, Parameter
  26. from ..smart_list import SmartList
  27. from ..wikicode import Wikicode
  28. __all__ = ["Builder"]
  29. _HANDLERS = {
  30. tokens.Text: lambda self, token: Text(token.text)
  31. }
  32. def _add_handler(token_type):
  33. """Create a decorator that adds a handler function to the lookup table."""
  34. def decorator(func):
  35. """Add a handler function to the lookup table."""
  36. _HANDLERS[token_type] = func
  37. return func
  38. return decorator
  39. class Builder:
  40. """Builds a tree of nodes out of a sequence of tokens.
  41. To use, pass a list of :class:`.Token`\\ s to the :meth:`build` method. The
  42. list will be exhausted as it is parsed and a :class:`.Wikicode` object
  43. containing the node tree will be returned.
  44. """
  45. def __init__(self):
  46. self._tokens = []
  47. self._stacks = []
  48. def _push(self):
  49. """Push a new node list onto the stack."""
  50. self._stacks.append([])
  51. def _pop(self):
  52. """Pop the current node list off of the stack.
  53. The raw node list is wrapped in a :class:`.SmartList` and then in a
  54. :class:`.Wikicode` object.
  55. """
  56. return Wikicode(SmartList(self._stacks.pop()))
  57. def _write(self, item):
  58. """Append a node to the current node list."""
  59. self._stacks[-1].append(item)
  60. def _handle_parameter(self, default):
  61. """Handle a case where a parameter is at the head of the tokens.
  62. *default* is the value to use if no parameter name is defined.
  63. """
  64. key = None
  65. showkey = False
  66. self._push()
  67. while self._tokens:
  68. token = self._tokens.pop()
  69. if isinstance(token, tokens.TemplateParamEquals):
  70. key = self._pop()
  71. showkey = True
  72. self._push()
  73. elif isinstance(token, (tokens.TemplateParamSeparator,
  74. tokens.TemplateClose)):
  75. self._tokens.append(token)
  76. value = self._pop()
  77. if key is None:
  78. key = Wikicode(SmartList([Text(str(default))]))
  79. return Parameter(key, value, showkey)
  80. else:
  81. self._write(self._handle_token(token))
  82. raise ParserError("_handle_parameter() missed a close token")
  83. @_add_handler(tokens.TemplateOpen)
  84. def _handle_template(self, token):
  85. """Handle a case where a template is at the head of the tokens."""
  86. params = []
  87. default = 1
  88. self._push()
  89. while self._tokens:
  90. token = self._tokens.pop()
  91. if isinstance(token, tokens.TemplateParamSeparator):
  92. if not params:
  93. name = self._pop()
  94. param = self._handle_parameter(default)
  95. params.append(param)
  96. if not param.showkey:
  97. default += 1
  98. elif isinstance(token, tokens.TemplateClose):
  99. if not params:
  100. name = self._pop()
  101. return Template(name, params)
  102. else:
  103. self._write(self._handle_token(token))
  104. raise ParserError("_handle_template() missed a close token")
  105. @_add_handler(tokens.ArgumentOpen)
  106. def _handle_argument(self, token):
  107. """Handle a case where an argument is at the head of the tokens."""
  108. name = None
  109. self._push()
  110. while self._tokens:
  111. token = self._tokens.pop()
  112. if isinstance(token, tokens.ArgumentSeparator):
  113. name = self._pop()
  114. self._push()
  115. elif isinstance(token, tokens.ArgumentClose):
  116. if name is not None:
  117. return Argument(name, self._pop())
  118. return Argument(self._pop())
  119. else:
  120. self._write(self._handle_token(token))
  121. raise ParserError("_handle_argument() missed a close token")
  122. @_add_handler(tokens.WikilinkOpen)
  123. def _handle_wikilink(self, token):
  124. """Handle a case where a wikilink is at the head of the tokens."""
  125. title = None
  126. self._push()
  127. while self._tokens:
  128. token = self._tokens.pop()
  129. if isinstance(token, tokens.WikilinkSeparator):
  130. title = self._pop()
  131. self._push()
  132. elif isinstance(token, tokens.WikilinkClose):
  133. if title is not None:
  134. return Wikilink(title, self._pop())
  135. return Wikilink(self._pop())
  136. else:
  137. self._write(self._handle_token(token))
  138. raise ParserError("_handle_wikilink() missed a close token")
  139. @_add_handler(tokens.ExternalLinkOpen)
  140. def _handle_external_link(self, token):
  141. """Handle when an external link is at the head of the tokens."""
  142. brackets, url = token.brackets, None
  143. self._push()
  144. while self._tokens:
  145. token = self._tokens.pop()
  146. if isinstance(token, tokens.ExternalLinkSeparator):
  147. url = self._pop()
  148. self._push()
  149. elif isinstance(token, tokens.ExternalLinkClose):
  150. if url is not None:
  151. return ExternalLink(url, self._pop(), brackets)
  152. return ExternalLink(self._pop(), brackets=brackets)
  153. else:
  154. self._write(self._handle_token(token))
  155. raise ParserError("_handle_external_link() missed a close token")
  156. @_add_handler(tokens.HTMLEntityStart)
  157. def _handle_entity(self, token):
  158. """Handle a case where an HTML entity is at the head of the tokens."""
  159. token = self._tokens.pop()
  160. if isinstance(token, tokens.HTMLEntityNumeric):
  161. token = self._tokens.pop()
  162. if isinstance(token, tokens.HTMLEntityHex):
  163. text = self._tokens.pop()
  164. self._tokens.pop() # Remove HTMLEntityEnd
  165. return HTMLEntity(text.text, named=False, hexadecimal=True,
  166. hex_char=token.char)
  167. self._tokens.pop() # Remove HTMLEntityEnd
  168. return HTMLEntity(token.text, named=False, hexadecimal=False)
  169. self._tokens.pop() # Remove HTMLEntityEnd
  170. return HTMLEntity(token.text, named=True, hexadecimal=False)
  171. @_add_handler(tokens.HeadingStart)
  172. def _handle_heading(self, token):
  173. """Handle a case where a heading is at the head of the tokens."""
  174. level = token.level
  175. self._push()
  176. while self._tokens:
  177. token = self._tokens.pop()
  178. if isinstance(token, tokens.HeadingEnd):
  179. title = self._pop()
  180. return Heading(title, level)
  181. self._write(self._handle_token(token))
  182. raise ParserError("_handle_heading() missed a close token")
  183. @_add_handler(tokens.CommentStart)
  184. def _handle_comment(self, token):
  185. """Handle a case where an HTML comment is at the head of the tokens."""
  186. self._push()
  187. while self._tokens:
  188. token = self._tokens.pop()
  189. if isinstance(token, tokens.CommentEnd):
  190. contents = self._pop()
  191. return Comment(contents)
  192. self._write(self._handle_token(token))
  193. raise ParserError("_handle_comment() missed a close token")
  194. def _handle_attribute(self, start):
  195. """Handle a case where a tag attribute is at the head of the tokens."""
  196. name = quotes = None
  197. self._push()
  198. while self._tokens:
  199. token = self._tokens.pop()
  200. if isinstance(token, tokens.TagAttrEquals):
  201. name = self._pop()
  202. self._push()
  203. elif isinstance(token, tokens.TagAttrQuote):
  204. quotes = token.char
  205. elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen,
  206. tokens.TagCloseSelfclose)):
  207. self._tokens.append(token)
  208. if name:
  209. value = self._pop()
  210. else:
  211. name, value = self._pop(), None
  212. return Attribute(name, value, quotes, start.pad_first,
  213. start.pad_before_eq, start.pad_after_eq)
  214. else:
  215. self._write(self._handle_token(token))
  216. raise ParserError("_handle_attribute() missed a close token")
  217. @_add_handler(tokens.TagOpenOpen)
  218. def _handle_tag(self, token):
  219. """Handle a case where a tag is at the head of the tokens."""
  220. close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose)
  221. implicit, attrs, contents, closing_tag = False, [], None, None
  222. wiki_markup, invalid = token.wiki_markup, token.invalid or False
  223. wiki_style_separator, closing_wiki_markup = None, wiki_markup
  224. self._push()
  225. while self._tokens:
  226. token = self._tokens.pop()
  227. if isinstance(token, tokens.TagAttrStart):
  228. attrs.append(self._handle_attribute(token))
  229. elif isinstance(token, tokens.TagCloseOpen):
  230. wiki_style_separator = token.wiki_markup
  231. padding = token.padding or ""
  232. tag = self._pop()
  233. self._push()
  234. elif isinstance(token, tokens.TagOpenClose):
  235. closing_wiki_markup = token.wiki_markup
  236. contents = self._pop()
  237. self._push()
  238. elif isinstance(token, close_tokens):
  239. if isinstance(token, tokens.TagCloseSelfclose):
  240. closing_wiki_markup = token.wiki_markup
  241. tag = self._pop()
  242. self_closing = True
  243. padding = token.padding or ""
  244. implicit = token.implicit or False
  245. else:
  246. self_closing = False
  247. closing_tag = self._pop()
  248. return Tag(tag, contents, attrs, wiki_markup, self_closing,
  249. invalid, implicit, padding, closing_tag,
  250. wiki_style_separator, closing_wiki_markup)
  251. else:
  252. self._write(self._handle_token(token))
  253. raise ParserError("_handle_tag() missed a close token")
  254. def _handle_token(self, token):
  255. """Handle a single token."""
  256. try:
  257. return _HANDLERS[type(token)](self, token)
  258. except KeyError:
  259. err = "_handle_token() got unexpected {0}"
  260. raise ParserError(err.format(type(token).__name__)) from None
  261. def build(self, tokenlist):
  262. """Build a Wikicode object from a list tokens and return it."""
  263. self._tokens = tokenlist
  264. self._tokens.reverse()
  265. self._push()
  266. while self._tokens:
  267. node = self._handle_token(self._tokens.pop())
  268. self._write(node)
  269. return self._pop()
# The decorator factory has done its job once the handler methods above are
# registered; remove the name from the module namespace.
del _add_handler