A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

169 regels
6.4 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import re
  23. from . import tokens
  24. from ..nodes import Heading, HTMLEntity, Tag, Template, Text
  25. from ..nodes.extras import Attribute, Parameter
  26. from ..smart_list import SmartList
  27. from ..wikicode import Wikicode
  28. __all__ = ["Builder"]
  29. class Builder(object):
  30. def __init__(self):
  31. self._tokens = []
  32. self._stacks = []
  33. def _push(self):
  34. self._stacks.append([])
  35. def _pop(self):
  36. return Wikicode(SmartList(self._stacks.pop()))
  37. def _write(self, item):
  38. self._stacks[-1].append(item)
  39. def _handle_parameter(self, key):
  40. showkey = False
  41. self._push()
  42. while self._tokens:
  43. token = self._tokens.pop(0)
  44. if isinstance(token, tokens.TemplateParamEquals):
  45. key = self._pop()
  46. showkey = True
  47. self._push()
  48. elif isinstance(token, (tokens.TemplateParamSeparator,
  49. tokens.TemplateClose)):
  50. self._tokens.insert(0, token)
  51. value = self._pop()
  52. return Parameter(key, value, showkey)
  53. else:
  54. self._write(self._handle_token())
  55. def _handle_template(self):
  56. params = []
  57. int_keys = set()
  58. int_key_range = {1}
  59. self._push()
  60. while self._tokens:
  61. token = self._tokens.pop(0)
  62. if isinstance(token, tokens.TemplateParamSeparator):
  63. if not params:
  64. name = self._pop()
  65. param = self._handle_parameter(min(int_key_range - int_keys))
  66. if re.match(r"[1-9][0-9]*$", param.name.strip()):
  67. int_keys.add(int(param.name))
  68. int_key_range.add(len(int_keys) + 1)
  69. params.append(param)
  70. elif isinstance(token, tokens.TemplateClose):
  71. if not params:
  72. name = self._pop()
  73. return Template(name, params)
  74. else:
  75. self._write(self._handle_token())
  76. def _handle_entity(self):
  77. token = self._tokens.pop(0)
  78. if isinstance(token, tokens.HTMLEntityNumeric):
  79. token = self._tokens.pop(0)
  80. if isinstance(token, tokens.HTMLEntityHex):
  81. token = self._tokens.pop(0)
  82. return HTMLEntity(token.text, named=False, hexadecimal=True)
  83. return HTMLEntity(token.text, named=False, hexadecimal=False)
  84. return HTMLEntity(token.text, named=True, hexadecimal=False)
  85. def _handle_heading(self, token):
  86. level = token.level
  87. self._push()
  88. while self._tokens:
  89. token = self._tokens.pop(0)
  90. if isinstance(token, tokens.HeadingBlock):
  91. title = self._pop()
  92. return Heading(title, level)
  93. else:
  94. self._write(self._handle_token())
  95. def _handle_attribute(self):
  96. name, quoted = None, False
  97. self._push()
  98. while self._tokens:
  99. token = self._tokens.pop(0)
  100. if isinstance(token, tokens.TagAttrEquals):
  101. name = self._pop()
  102. self._push()
  103. elif isinstance(token, tokens.TagAttrQuote):
  104. quoted = True
  105. elif isinstance(token, (tokens.TagAttrStart,
  106. tokens.TagCloseOpen)):
  107. self._tokens.insert(0, token)
  108. if name is not None:
  109. return Attribute(name, self._pop(), quoted)
  110. return Attribute(self._pop(), quoted=quoted)
  111. else:
  112. self._write(self._handle_token())
  113. def _handle_tag(self, token):
  114. type_, showtag = token.type, token.showtag
  115. attrs = []
  116. self._push()
  117. while self._tokens:
  118. token = self._tokens.pop(0)
  119. if isinstance(token, tokens.TagAttrStart):
  120. attrs.append(self._handle_attribute())
  121. elif isinstance(token, tokens.TagCloseOpen):
  122. open_pad = token.padding
  123. tag = self._pop()
  124. self._push()
  125. elif isinstance(token, tokens.TagCloseSelfclose):
  126. tag = self._pop()
  127. return Tag(type_, tag, attrs=attrs, showtag=showtag,
  128. self_closing=True, open_padding=token.padding)
  129. elif isinstance(token, tokens.TagOpenClose):
  130. contents = self._pop()
  131. elif isinstance(token, tokens.TagCloseClose):
  132. return Tag(type_, tag, contents, attrs, showtag, False,
  133. open_pad, token.padding)
  134. else:
  135. self._write(self._handle_token())
  136. def _handle_token(self):
  137. token = self._tokens.pop(0)
  138. if isinstance(token, tokens.Text):
  139. return Text(token.text)
  140. elif isinstance(token, tokens.TemplateOpen):
  141. return self._handle_template()
  142. elif isinstance(token, tokens.HTMLEntityStart):
  143. return self._handle_entity()
  144. elif isinstance(token, tokens.HeadingBlock):
  145. return self._handle_heading(token)
  146. elif isinstance(token, tokens.TagOpenOpen):
  147. return self._handle_tag(token)
  148. def build(self, tokenlist):
  149. self._tokens = tokenlist
  150. self._push()
  151. while self._tokens:
  152. self._write(self._handle_token())
  153. return self._pop()