A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 
 

207 行
7.1 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import htmlentitydefs
  23. import string
  24. from . import contexts
  25. from . import tokens
  26. __all__ = ["Tokenizer"]
  27. class BadRoute(Exception):
  28. pass
  29. class Tokenizer(object):
  30. START = object()
  31. END = object()
  32. def __init__(self):
  33. self._text = None
  34. self._head = 0
  35. self._stacks = []
  36. @property
  37. def _stack(self):
  38. return self._stacks[-1][0]
  39. @property
  40. def _context(self):
  41. return self._stacks[-1][1]
  42. @_context.setter
  43. def _context(self, value):
  44. self._stacks[-1][1] = value
  45. def _push(self, context=0):
  46. self._stacks.append([[], context])
  47. def _pop(self):
  48. return self._stacks.pop()[0]
  49. def _write(self, token, stack=None):
  50. if stack is None:
  51. stack = self._stack
  52. if not stack:
  53. stack.append(token)
  54. return
  55. last = stack[-1]
  56. if isinstance(token, tokens.Text) and isinstance(last, tokens.Text):
  57. last.text += token.text
  58. else:
  59. stack.append(token)
  60. def _write_all(self, tokenlist, stack=None):
  61. if stack is None:
  62. stack = self._stack
  63. stack.extend(tokenlist)
  64. def _read(self, delta=0, wrap=False):
  65. index = self._head + delta
  66. if index < 0 and (not wrap or abs(index) > len(self._text)):
  67. return self.START
  68. if index >= len(self._text):
  69. return self.END
  70. return self._text[index]
  71. def _at_head(self, chars):
  72. return all([self._read(i) == chars[i] for i in xrange(len(chars))])
  73. def _verify_context_pre_stop(self):
  74. if self._read() is self.END:
  75. if self._context & contexts.TEMPLATE:
  76. raise BadRoute(self._pop())
  77. def _catch_stop(self, stop):
  78. if self._read() is self.END:
  79. return True
  80. try:
  81. iter(stop)
  82. except TypeError:
  83. if self._read() is stop:
  84. return True
  85. else:
  86. if all([self._read(i) == stop[i] for i in xrange(len(stop))]):
  87. self._head += len(stop) - 1
  88. return True
  89. return False
  90. def _verify_context_post_stop(self):
  91. if self._context & contexts.TEMPLATE_NAME and self._stack:
  92. head = self._stack[-1]
  93. if isinstance(head, tokens.Text):
  94. if head.text.strip() and head.text.endswith("\n"):
  95. if self._read() not in ["|", "=", "\n"]:
  96. raise BadRoute(self._pop())
  97. def _parse_template(self):
  98. reset = self._head
  99. self._head += 2
  100. try:
  101. template = self._parse_until("}}", contexts.TEMPLATE_NAME)
  102. except BadRoute:
  103. self._head = reset
  104. self._write(tokens.Text(text=self._read()))
  105. else:
  106. self._write(tokens.TemplateOpen())
  107. self._write_all(template)
  108. self._write(tokens.TemplateClose())
  109. def _handle_template_param(self):
  110. if self._context & contexts.TEMPLATE_NAME:
  111. self._context ^= contexts.TEMPLATE_NAME
  112. if self._context & contexts.TEMPLATE_PARAM_VALUE:
  113. self._context ^= contexts.TEMPLATE_PARAM_VALUE
  114. self._context |= contexts.TEMPLATE_PARAM_KEY
  115. self._write(tokens.TemplateParamSeparator())
  116. def _handle_template_param_value(self):
  117. self._context ^= contexts.TEMPLATE_PARAM_KEY
  118. self._context |= contexts.TEMPLATE_PARAM_VALUE
  119. self._write(tokens.TemplateParamEquals())
  120. def _parse_entity(self):
  121. reset = self._head
  122. self._head += 1
  123. try:
  124. self._push()
  125. self._write(tokens.HTMLEntityStart())
  126. numeric = hexadecimal = False
  127. if self._at_head("#"):
  128. numeric = True
  129. self._write(tokens.HTMLEntityNumeric())
  130. if self._read(1).lower() == "x":
  131. hexadecimal = True
  132. self._write(tokens.HTMLEntityHex(char=self._read(1)))
  133. self._head += 2
  134. else:
  135. self._head += 1
  136. text = []
  137. valid = string.hexdigits if hexadecimal else string.digits
  138. if not numeric and not hexadecimal:
  139. valid += string.ascii_letters
  140. while True:
  141. if self._at_head(";"):
  142. text = "".join(text)
  143. if numeric:
  144. test = int(text, 16) if hexadecimal else int(text)
  145. if test < 1 or test > 0x10FFFF:
  146. raise BadRoute(self._pop())
  147. else:
  148. if text not in htmlentitydefs.entitydefs:
  149. raise BadRoute(self._pop())
  150. self._write(tokens.Text(text=text))
  151. self._write(tokens.HTMLEntityEnd())
  152. break
  153. if self._read() is self.END or self._read() not in valid:
  154. raise BadRoute(self._pop())
  155. text.append(self._read())
  156. self._head += 1
  157. except BadRoute:
  158. self._head = reset
  159. self._write(tokens.Text(text=self._read()))
  160. else:
  161. self._write_all(self._pop())
  162. def _parse_until(self, stop, context=0):
  163. self._push(context)
  164. while True:
  165. self._verify_context_pre_stop()
  166. if self._catch_stop(stop):
  167. return self._pop()
  168. self._verify_context_post_stop()
  169. if self._at_head("{{"):
  170. self._parse_template()
  171. elif self._at_head("|") and self._context & contexts.TEMPLATE:
  172. self._handle_template_param()
  173. elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY:
  174. self._handle_template_param_value()
  175. elif self._at_head("&"):
  176. self._parse_entity()
  177. else:
  178. self._write(tokens.Text(text=self._read()))
  179. self._head += 1
  180. def tokenize(self, text):
  181. self._text = list(text)
  182. return self._parse_until(stop=self.END)