A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

283 lines
11 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from . import tokens
  24. from ..compat import str
  25. from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
  26. Template, Text, Wikilink)
  27. from ..nodes.extras import Attribute, Parameter
  28. from ..smart_list import SmartList
  29. from ..wikicode import Wikicode
  30. __all__ = ["Builder"]
  31. class Builder(object):
  32. """Combines a sequence of tokens into a tree of ``Wikicode`` objects.
  33. To use, pass a list of :py:class:`~.Token`\ s to the :py:meth:`build`
  34. method. The list will be exhausted as it is parsed and a
  35. :py:class:`~.Wikicode` object will be returned.
  36. """
  37. def __init__(self):
  38. self._tokens = []
  39. self._stacks = []
  40. def _wrap(self, nodes):
  41. """Properly wrap a list of nodes in a ``Wikicode`` object."""
  42. return Wikicode(SmartList(nodes))
  43. def _push(self):
  44. """Push a new node list onto the stack."""
  45. self._stacks.append([])
  46. def _pop(self, wrap=True):
  47. """Pop the current node list off of the stack.
  48. If *wrap* is ``True``, we will call :py:meth:`_wrap` on the list.
  49. """
  50. if wrap:
  51. return self._wrap(self._stacks.pop())
  52. return self._stacks.pop()
  53. def _write(self, item):
  54. """Append a node to the current node list."""
  55. self._stacks[-1].append(item)
  56. def _handle_parameter(self, default):
  57. """Handle a case where a parameter is at the head of the tokens.
  58. *default* is the value to use if no parameter name is defined.
  59. """
  60. key = None
  61. showkey = False
  62. self._push()
  63. while self._tokens:
  64. token = self._tokens.pop()
  65. if isinstance(token, tokens.TemplateParamEquals):
  66. key = self._pop()
  67. showkey = True
  68. self._push()
  69. elif isinstance(token, (tokens.TemplateParamSeparator,
  70. tokens.TemplateClose)):
  71. self._tokens.append(token)
  72. value = self._pop()
  73. if key is None:
  74. key = self._wrap([Text(str(default))])
  75. return Parameter(key, value, showkey)
  76. else:
  77. self._write(self._handle_token(token))
  78. def _handle_template(self):
  79. """Handle a case where a template is at the head of the tokens."""
  80. params = []
  81. default = 1
  82. self._push()
  83. while self._tokens:
  84. token = self._tokens.pop()
  85. if isinstance(token, tokens.TemplateParamSeparator):
  86. if not params:
  87. name = self._pop()
  88. param = self._handle_parameter(default)
  89. params.append(param)
  90. if not param.showkey:
  91. default += 1
  92. elif isinstance(token, tokens.TemplateClose):
  93. if not params:
  94. name = self._pop()
  95. return Template(name, params)
  96. else:
  97. self._write(self._handle_token(token))
  98. def _handle_argument(self):
  99. """Handle a case where an argument is at the head of the tokens."""
  100. name = None
  101. self._push()
  102. while self._tokens:
  103. token = self._tokens.pop()
  104. if isinstance(token, tokens.ArgumentSeparator):
  105. name = self._pop()
  106. self._push()
  107. elif isinstance(token, tokens.ArgumentClose):
  108. if name is not None:
  109. return Argument(name, self._pop())
  110. return Argument(self._pop())
  111. else:
  112. self._write(self._handle_token(token))
  113. def _handle_wikilink(self):
  114. """Handle a case where a wikilink is at the head of the tokens."""
  115. title = None
  116. self._push()
  117. while self._tokens:
  118. token = self._tokens.pop()
  119. if isinstance(token, tokens.WikilinkSeparator):
  120. title = self._pop()
  121. self._push()
  122. elif isinstance(token, tokens.WikilinkClose):
  123. if title is not None:
  124. return Wikilink(title, self._pop())
  125. return Wikilink(self._pop())
  126. else:
  127. self._write(self._handle_token(token))
  128. def _handle_entity(self):
  129. """Handle a case where an HTML entity is at the head of the tokens."""
  130. token = self._tokens.pop()
  131. if isinstance(token, tokens.HTMLEntityNumeric):
  132. token = self._tokens.pop()
  133. if isinstance(token, tokens.HTMLEntityHex):
  134. text = self._tokens.pop()
  135. self._tokens.pop() # Remove HTMLEntityEnd
  136. return HTMLEntity(text.text, named=False, hexadecimal=True,
  137. hex_char=token.char)
  138. self._tokens.pop() # Remove HTMLEntityEnd
  139. return HTMLEntity(token.text, named=False, hexadecimal=False)
  140. self._tokens.pop() # Remove HTMLEntityEnd
  141. return HTMLEntity(token.text, named=True, hexadecimal=False)
  142. def _handle_heading(self, token):
  143. """Handle a case where a heading is at the head of the tokens."""
  144. level = token.level
  145. self._push()
  146. while self._tokens:
  147. token = self._tokens.pop()
  148. if isinstance(token, tokens.HeadingEnd):
  149. title = self._pop()
  150. return Heading(title, level)
  151. else:
  152. self._write(self._handle_token(token))
  153. def _handle_comment(self):
  154. """Handle a case where an HTML comment is at the head of the tokens."""
  155. self._push()
  156. while self._tokens:
  157. token = self._tokens.pop()
  158. if isinstance(token, tokens.CommentEnd):
  159. contents = self._pop()
  160. return Comment(contents)
  161. else:
  162. self._write(self._handle_token(token))
  163. def _handle_attribute(self, start):
  164. """Handle a case where a tag attribute is at the head of the tokens."""
  165. name, quoted = None, False
  166. self._push()
  167. while self._tokens:
  168. token = self._tokens.pop()
  169. if isinstance(token, tokens.TagAttrEquals):
  170. name = self._pop()
  171. self._push()
  172. elif isinstance(token, tokens.TagAttrQuote):
  173. quoted = True
  174. elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen,
  175. tokens.TagCloseSelfclose)):
  176. self._tokens.append(token)
  177. if name:
  178. value = self._pop()
  179. else:
  180. name, value = self._pop(), None
  181. return Attribute(name, value, quoted, start.pad_first,
  182. start.pad_before_eq, start.pad_after_eq)
  183. else:
  184. self._write(self._handle_token(token))
  185. def _handle_tag(self, token):
  186. """Handle a case where a tag is at the head of the tokens."""
  187. close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose)
  188. implicit, attrs, contents, closing_tag = False, [], None, None
  189. wiki_markup, invalid = token.wiki_markup, token.invalid or False
  190. self._push()
  191. while self._tokens:
  192. token = self._tokens.pop()
  193. if isinstance(token, tokens.TagAttrStart):
  194. attrs.append(self._handle_attribute(token))
  195. elif isinstance(token, tokens.TagCloseOpen):
  196. padding = token.padding or ""
  197. tag = self._pop()
  198. self._push()
  199. elif isinstance(token, tokens.TagOpenClose):
  200. contents = self._pop()
  201. self._push()
  202. elif isinstance(token, close_tokens):
  203. if isinstance(token, tokens.TagCloseSelfclose):
  204. tag = self._pop()
  205. self_closing = True
  206. padding = token.padding or ""
  207. implicit = token.implicit or False
  208. else:
  209. self_closing = False
  210. closing_tag = self._pop()
  211. return Tag(tag, contents, attrs, wiki_markup, self_closing,
  212. invalid, implicit, padding, closing_tag)
  213. else:
  214. self._write(self._handle_token(token))
  215. def _handle_external_link(self, token):
  216. """Handle when an external link is at the head of the tokens."""
  217. brackets, url = token.brackets, None
  218. self._push()
  219. while self._tokens:
  220. token = self._tokens.pop()
  221. if isinstance(token, tokens.ExternalLinkSeparator):
  222. url = self._pop()
  223. self._push()
  224. elif isinstance(token, tokens.ExternalLinkClose):
  225. if url is not None:
  226. return ExternalLink(url, self._pop(), brackets)
  227. return ExternalLink(self._pop(), brackets=brackets)
  228. else:
  229. self._write(self._handle_token(token))
  230. def _handle_token(self, token):
  231. """Handle a single token."""
  232. if isinstance(token, tokens.Text):
  233. return Text(token.text)
  234. elif isinstance(token, tokens.TemplateOpen):
  235. return self._handle_template()
  236. elif isinstance(token, tokens.ArgumentOpen):
  237. return self._handle_argument()
  238. elif isinstance(token, tokens.WikilinkOpen):
  239. return self._handle_wikilink()
  240. elif isinstance(token, tokens.HTMLEntityStart):
  241. return self._handle_entity()
  242. elif isinstance(token, tokens.HeadingStart):
  243. return self._handle_heading(token)
  244. elif isinstance(token, tokens.CommentStart):
  245. return self._handle_comment()
  246. elif isinstance(token, tokens.TagOpenOpen):
  247. return self._handle_tag(token)
  248. elif isinstance(token, tokens.ExternalLinkOpen):
  249. return self._handle_external_link(token)
  250. def build(self, tokenlist):
  251. """Build a Wikicode object from a list tokens and return it."""
  252. self._tokens = tokenlist
  253. self._tokens.reverse()
  254. self._push()
  255. while self._tokens:
  256. node = self._handle_token(self._tokens.pop())
  257. self._write(node)
  258. return self._pop()