A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.
 
 
 
 

158 řádky
6.4 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import re
  23. from . import tokens
  24. from .build_stack import BuildStack
  25. from ..nodes import Heading, HTMLEntity, Tag, Template, Text
  26. from ..nodes.extras import Attribute, Parameter
  27. __all__ = ["Builder"]
  28. class Builder(object):
  29. def __init__(self):
  30. self._tokens = []
  31. self._stack = BuildStack()
  32. def _handle_parameter(self, key):
  33. showkey = False
  34. self._stack.push()
  35. while self._tokens:
  36. token = self._tokens.pop(0)
  37. if isinstance(token, tokens.TEMPLATE_PARAM_EQUALS):
  38. key = self._stack.pop()
  39. showkey = True
  40. self._stack.push()
  41. elif isinstance(token, (tokens.TEMPLATE_PARAM_SEPARATOR,
  42. tokens.TEMPLATE_CLOSE)):
  43. self._tokens.insert(0, token)
  44. value = self._stack.pop()
  45. return Parameter(key, value, showkey)
  46. else:
  47. self._stack.write(self._handle_token())
  48. def _handle_template(self):
  49. params = []
  50. int_keys = set()
  51. int_key_range = {1}
  52. self._stack.push()
  53. while self._tokens:
  54. token = self._tokens.pop(0)
  55. if isinstance(token, tokens.TEMPLATE_PARAM_SEPARATOR):
  56. if not params:
  57. name = self._stack.pop()
  58. param = self._handle_parameter(min(int_key_range - int_keys))
  59. if re.match(r"[1-9][0-9]*$", param.key.strip()):
  60. int_keys.add(int(param.key))
  61. int_key_range.add(len(int_keys) + 1)
  62. params.append(param)
  63. elif isinstance(token, tokens.TEMPLATE_CLOSE):
  64. if not params:
  65. name = self._stack.pop()
  66. return Template(name, params)
  67. else:
  68. self._stack.write(self._handle_token())
  69. def _handle_entity(self):
  70. token = self._tokens.pop(0)
  71. if isinstance(token, tokens.HTML_ENTITY_NUMERIC):
  72. token = self._tokens.pop(0)
  73. if isinstance(token, tokens.HTML_ENTITY_HEX):
  74. token = self._tokens.pop(0)
  75. return HTMLEntity(token.text, named=False, hexadecimal=True)
  76. return HTMLEntity(token.text, named=False, hexadecimal=False)
  77. return HTMLEntity(token.text, named=True, hexadecimal=False)
  78. def _handle_heading(self, token):
  79. level = token.level
  80. self._stack.push()
  81. while self._tokens:
  82. token = self._tokens.pop(0)
  83. if isinstance(token, tokens.HEADING_BLOCK):
  84. title = self._stack.pop()
  85. return Heading(title, level)
  86. else:
  87. self._stack.write(self._handle_token())
  88. def _handle_attribute(self):
  89. name, quoted = None, False
  90. self._stack.push()
  91. while self._tokens:
  92. token = self._tokens.pop(0)
  93. if isinstance(token, tokens.TAG_ATTR_EQUALS):
  94. name = self._stack.pop()
  95. self._stack.push()
  96. elif isinstance(token, tokens.TAG_ATTR_QUOTE):
  97. quoted = True
  98. elif isinstance(token, (tokens.TAG_ATTR_START,
  99. tokens.TAG_CLOSE_OPEN)):
  100. self._tokens.insert(0, token)
  101. if name is not None:
  102. return Attribute(name, self._stack.pop(), quoted)
  103. return Attribute(self._stack.pop(), quoted=quoted)
  104. else:
  105. self._stack.write(self._handle_token())
  106. def _handle_tag(self, token):
  107. type_, showtag, attrs = token.type, token.showtag, attrs
  108. self._stack.push()
  109. while self._tokens:
  110. token = self._tokens.pop(0)
  111. if isinstance(token, tokens.TAG_ATTR_START):
  112. attrs.append(self._handle_attribute())
  113. elif isinstance(token, tokens.TAG_CLOSE_OPEN):
  114. open_pad = token.padding
  115. tag = self._stack.pop()
  116. self._stack.push()
  117. elif isinstance(token, tokens.TAG_CLOSE_SELFCLOSE):
  118. tag = self._stack.pop()
  119. return Tag(type_, tag, attrs=attrs, showtag=showtag,
  120. self_closing=True, open_padding=token.padding)
  121. elif isinstance(token, tokens.TAG_OPEN_CLOSE):
  122. contents = self._stack.pop()
  123. elif isinstance(token, tokens.TAG_CLOSE_CLOSE):
  124. return Tag(type_, tag, contents, attrs, showtag, self_closing,
  125. open_pad, token.padding)
  126. else:
  127. self._stack.write(self._handle_token())
  128. def _handle_token(self):
  129. token = self._tokens.pop(0)
  130. if isinstance(token, tokens.TEXT):
  131. return Text(token.text)
  132. elif isinstance(token, tokens.TEMPLATE_OPEN):
  133. return self._handle_template()
  134. elif isinstance(token, tokens.HTML_ENTITY_START):
  135. return self._handle_entity()
  136. elif isinstance(token, tokens.HEADING_BLOCK):
  137. return self._handle_heading(token)
  138. elif isinstance(token, tokens.TAG_OPEN_OPEN):
  139. return self._handle_tag(token)
  140. def build(self, tokens):
  141. self._tokens = tokens
  142. self._stack.push()
  143. while self._tokens:
  144. self._stack.write(self._handle_token())
  145. return self._stack.pop()