A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.
 
 
 
 

120 строк
3.8 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
# Relative imports from the package: bitfield context flags used while
# parsing, and the token classes emitted by the tokenizer.
from . import contexts
from . import tokens

# Public API of this module.
__all__ = ["Tokenizer"]
  25. class BadRoute(Exception):
  26. pass
  27. class Tokenizer(object):
  28. START = object()
  29. END = object()
  30. def __init__(self):
  31. self._text = None
  32. self._head = 0
  33. self._stacks = []
  34. self._context = 0
  35. def _push(self):
  36. self._stacks.append([])
  37. def _pop(self):
  38. return self._stacks.pop()
  39. def _write(self, token, stack=None):
  40. if stack is None:
  41. stack = self._stacks[-1]
  42. if not stack:
  43. stack.append(token)
  44. return
  45. last = stack[-1]
  46. if isinstance(token, tokens.Text) and isinstance(last, tokens.Text):
  47. last.text += token.text
  48. else:
  49. stack.append(token)
  50. def _read(self, delta=0, wrap=False):
  51. index = self._head + delta
  52. if index < 0 and (not wrap or abs(index) > len(self._text)):
  53. return self.START
  54. if index >= len(self._text):
  55. return self.END
  56. return self._text[index]
  57. def _verify_context(self):
  58. if self._read() is self.END:
  59. if self._context & contexts.INSIDE_TEMPLATE:
  60. raise BadRoute()
  61. def _catch_stop(self, stop):
  62. if self._read() is self.END:
  63. return True
  64. try:
  65. iter(stop)
  66. except TypeError:
  67. if self._read() is stop:
  68. return True
  69. else:
  70. if all([self._read(i) == stop[i] for i in xrange(len(stop))]):
  71. self._head += len(stop) - 1
  72. return True
  73. return False
  74. def _parse_template(self):
  75. reset = self._head
  76. self._head += 2
  77. self._context |= contexts.TEMPLATE_NAME
  78. try:
  79. template = self._parse_until("}}")
  80. except BadRoute:
  81. self._head = reset
  82. self._write(tokens.Text(text=self._read()))
  83. else:
  84. self._write(tokens.TemplateOpen())
  85. self._stacks[-1] += template
  86. self._write(tokens.TemplateClose())
  87. ending = (contexts.TEMPLATE_NAME, contexts.TEMPLATE_PARAM_KEY,
  88. contexts.TEMPLATE_PARAM_VALUE)
  89. for context in ending:
  90. self._context ^= context if self._context & context else 0
  91. def _parse_until(self, stop):
  92. self._push()
  93. while True:
  94. self._verify_context()
  95. if self._catch_stop(stop):
  96. return self._pop()
  97. if self._read(0) == "{" and self._read(1) == "{":
  98. self._parse_template()
  99. else:
  100. self._write(tokens.Text(text=self._read()))
  101. self._head += 1
  102. def tokenize(self, text):
  103. self._text = list(text)
  104. return self._parse_until(stop=self.END)