A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

161 lines
5.3 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from . import contexts
  23. from . import tokens
  24. __all__ = ["Tokenizer"]
  25. class BadRoute(Exception):
  26. pass
class Tokenizer(object):
    """Creates a list of tokens from a string of wikicode."""

    # Sentinel objects returned by _read() when the requested position is
    # before the start or past the end of the text; compared with "is".
    START = object()
    END = object()

    def __init__(self):
        self._text = None    # list of single characters once tokenize() runs
        self._head = 0       # current read position within self._text
        self._stacks = []    # stack of [token-list, context-bitfield] frames

    @property
    def _stack(self):
        """The token list of the innermost (current) parse frame."""
        return self._stacks[-1][0]

    @property
    def _context(self):
        """The context bitfield of the innermost parse frame."""
        return self._stacks[-1][1]

    @_context.setter
    def _context(self, value):
        self._stacks[-1][1] = value

    def _push(self):
        """Open a new parse frame with an empty token list and zero context."""
        stack, context = [], 0
        self._stacks.append([stack, context])

    def _pop(self):
        """Close the current parse frame and return its token list."""
        return self._stacks.pop()[0]

    def _write(self, token, stack=None):
        """Append *token* to *stack* (default: the current stack).

        Consecutive Text tokens are merged so the output contains a single
        Text token per contiguous run of plain characters.
        """
        if stack is None:
            stack = self._stack
        if not stack:
            stack.append(token)
            return
        last = stack[-1]
        if isinstance(token, tokens.Text) and isinstance(last, tokens.Text):
            # Coalesce adjacent text rather than appending a new token.
            last.text += token.text
        else:
            stack.append(token)

    def _write_all(self, tokenlist, stack=None):
        """Append every token in *tokenlist* to *stack* (default: current).

        Note: unlike _write(), this does not merge adjacent Text tokens.
        """
        if stack is None:
            stack = self._stack
        stack.extend(tokenlist)

    def _read(self, delta=0, wrap=False):
        """Return the character *delta* places from the head.

        Returns the START sentinel when the index falls before the text and
        the END sentinel when it falls past it. With wrap=True, a negative
        index that is still within range reads from the end of the text
        (normal Python negative indexing).
        """
        index = self._head + delta
        if index < 0 and (not wrap or abs(index) > len(self._text)):
            return self.START
        if index >= len(self._text):
            return self.END
        return self._text[index]

    def _at_head(self, chars):
        """Return True if the text at the head matches the string *chars*."""
        return all([self._read(i) == chars[i] for i in xrange(len(chars))])

    def _verify_context_pre_stop(self):
        """Hook run before the stop check: hitting end-of-text while still
        inside a template means this parse route is invalid."""
        if self._read() is self.END:
            if self._context & contexts.TEMPLATE:
                raise BadRoute(self._pop())

    def _catch_stop(self, stop):
        """Return True if the head is at *stop*; end-of-text always stops.

        *stop* is either a sentinel (non-iterable; compared by identity) or
        a string. For a multi-character string stop, the head is advanced
        so the caller's final "+= 1" consumes the whole stop sequence.
        """
        if self._read() is self.END:
            return True
        try:
            iter(stop)
        except TypeError:
            # *stop* is a sentinel such as END; compare by identity.
            if self._read() is stop:
                return True
        else:
            if all([self._read(i) == stop[i] for i in xrange(len(stop))]):
                self._head += len(stop) - 1
                return True
        return False

    def _verify_context_post_stop(self):
        """Hook run after the stop check: if the template name already has
        non-whitespace content and ends with a newline, the next character
        must be "|", "=", or "\\n" — otherwise this was not a template."""
        if self._context & contexts.TEMPLATE_NAME and self._stack:
            head = self._stack[-1]
            if isinstance(head, tokens.Text):
                if head.text.strip() and head.text.endswith("\n"):
                    if self._read() not in ["|", "=", "\n"]:
                        raise BadRoute(self._pop())

    def _parse_template(self):
        """Parse a template ("{{...}}") beginning at the head.

        On a BadRoute, rewind the head and emit the single character at the
        head as plain text instead; otherwise wrap the parsed tokens in
        TemplateOpen/TemplateClose.
        """
        reset = self._head
        self._head += 2  # skip the opening "{{"
        try:
            template = self._parse_until("}}", contexts.TEMPLATE_NAME)
        except BadRoute:
            self._head = reset
            self._write(tokens.Text(text=self._read()))
        else:
            self._write(tokens.TemplateOpen())
            self._write_all(template)
            self._write(tokens.TemplateClose())

    def _handle_template_param(self):
        """Handle a "|" inside a template: clear the name or value context
        bit, enter the parameter-key context, and emit a separator token."""
        if self._context & contexts.TEMPLATE_NAME:
            self._context ^= contexts.TEMPLATE_NAME
        if self._context & contexts.TEMPLATE_PARAM_VALUE:
            self._context ^= contexts.TEMPLATE_PARAM_VALUE
        self._context |= contexts.TEMPLATE_PARAM_KEY
        self._write(tokens.TemplateParamSeparator())

    def _handle_template_param_value(self):
        """Handle an "=" inside a parameter key: switch the context from
        key to value and emit an equals token."""
        self._context ^= contexts.TEMPLATE_PARAM_KEY
        self._context |= contexts.TEMPLATE_PARAM_VALUE
        self._write(tokens.TemplateParamEquals())

    def _parse_until(self, stop, context=0):
        """Tokenize from the head until *stop* is reached; return the token
        list of the frame. May raise BadRoute via the verify hooks when the
        current route proves invalid."""
        self._push()
        self._context = context
        while True:
            self._verify_context_pre_stop()
            if self._catch_stop(stop):
                return self._pop()
            self._verify_context_post_stop()
            if self._at_head("{{"):
                self._parse_template()
            elif self._at_head("|") and self._context & contexts.TEMPLATE:
                self._handle_template_param()
            elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY:
                self._handle_template_param_value()
            else:
                self._write(tokens.Text(text=self._read()))
            self._head += 1

    def tokenize(self, text):
        """Build a list of tokens from a string of wikicode and return it."""
        self._text = list(text)
        return self._parse_until(stop=self.END)