A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

11 роки тому
12 роки тому
11 роки тому
12 роки тому
12 роки тому
12 роки тому
12 роки тому
12 роки тому
12 роки тому
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from math import log
  24. import re
  25. import string
  26. from . import contexts
  27. from . import tokens
  28. from ..compat import htmlentitydefs
# Names exported by ``from <this module> import *``; only the tokenizer class
# is listed, so BadRoute is not part of the star-import surface.
__all__ = ["Tokenizer"]
  30. class BadRoute(Exception):
  31. pass
  32. class Tokenizer(object):
  33. START = object()
  34. END = object()
  35. MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
  36. "/", "-", "\n", END]
  37. regex = re.compile(r"([{}\[\]<>|=&#*;:/\-\n])", flags=re.IGNORECASE)
  38. def __init__(self):
  39. self._text = None
  40. self._head = 0
  41. self._stacks = []
  42. self._global = 0
  43. @property
  44. def _stack(self):
  45. return self._stacks[-1][0]
  46. @property
  47. def _context(self):
  48. return self._stacks[-1][1]
  49. @_context.setter
  50. def _context(self, value):
  51. self._stacks[-1][1] = value
  52. @property
  53. def _textbuffer(self):
  54. return self._stacks[-1][2]
  55. @_textbuffer.setter
  56. def _textbuffer(self, value):
  57. self._stacks[-1][2] = value
  58. def _push(self, context=0):
  59. self._stacks.append([[], context, []])
  60. def _push_textbuffer(self):
  61. if self._textbuffer:
  62. self._stack.append(tokens.Text(text="".join(self._textbuffer)))
  63. self._textbuffer = []
  64. def _pop(self):
  65. self._push_textbuffer()
  66. return self._stacks.pop()[0]
  67. def _fail_route(self):
  68. self._pop()
  69. raise BadRoute()
  70. def _write(self, token):
  71. self._push_textbuffer()
  72. self._stack.append(token)
  73. def _write_text(self, text):
  74. self._textbuffer.append(text)
  75. def _write_all(self, tokenlist):
  76. if tokenlist and isinstance(tokenlist[0], tokens.Text):
  77. self._write_text(tokenlist.pop(0).text)
  78. self._push_textbuffer()
  79. self._stack.extend(tokenlist)
  80. def _read(self, delta=0, wrap=False, strict=False):
  81. index = self._head + delta
  82. if index < 0 and (not wrap or abs(index) > len(self._text)):
  83. return self.START
  84. try:
  85. return self._text[index]
  86. except IndexError:
  87. if strict:
  88. self._fail_route()
  89. return self.END
  90. def _parse_template(self):
  91. reset = self._head
  92. self._head += 2
  93. try:
  94. template = self._parse(contexts.TEMPLATE_NAME)
  95. except BadRoute:
  96. self._head = reset
  97. self._write_text(self._read())
  98. else:
  99. self._write(tokens.TemplateOpen())
  100. self._write_all(template)
  101. self._write(tokens.TemplateClose())
  102. def _verify_template_name(self):
  103. self._push_textbuffer()
  104. if self._stack:
  105. text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
  106. text = "".join([token.text for token in text])
  107. if text.strip() and "\n" in text.strip():
  108. self._fail_route()
  109. def _handle_template_param(self):
  110. if self._context & contexts.TEMPLATE_NAME:
  111. self._verify_template_name()
  112. self._context ^= contexts.TEMPLATE_NAME
  113. if self._context & contexts.TEMPLATE_PARAM_VALUE:
  114. self._context ^= contexts.TEMPLATE_PARAM_VALUE
  115. self._context |= contexts.TEMPLATE_PARAM_KEY
  116. self._write(tokens.TemplateParamSeparator())
  117. def _handle_template_param_value(self):
  118. self._context ^= contexts.TEMPLATE_PARAM_KEY
  119. self._context |= contexts.TEMPLATE_PARAM_VALUE
  120. self._write(tokens.TemplateParamEquals())
  121. def _handle_template_end(self):
  122. if self._context & contexts.TEMPLATE_NAME:
  123. self._verify_template_name()
  124. self._head += 1
  125. return self._pop()
  126. def _parse_heading(self):
  127. self._global |= contexts.GL_HEADING
  128. reset = self._head
  129. self._head += 1
  130. best = 1
  131. while self._read() == "=":
  132. best += 1
  133. self._head += 1
  134. context = contexts.HEADING_LEVEL_1 << min(best - 1, 5)
  135. try:
  136. title, level = self._parse(context)
  137. except BadRoute:
  138. self._head = reset + best - 1
  139. self._write_text("=" * best)
  140. else:
  141. self._write(tokens.HeadingStart(level=level))
  142. if level < best:
  143. self._write_text("=" * (best - level))
  144. self._write_all(title)
  145. self._write(tokens.HeadingEnd())
  146. finally:
  147. self._global ^= contexts.GL_HEADING
  148. def _handle_heading_end(self):
  149. reset = self._head
  150. self._head += 1
  151. best = 1
  152. while self._read() == "=":
  153. best += 1
  154. self._head += 1
  155. current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
  156. level = min(current, min(best, 6))
  157. try:
  158. after, after_level = self._parse(self._context)
  159. except BadRoute:
  160. if level < best:
  161. self._write_text("=" * (best - level))
  162. self._head = reset + best - 1
  163. return self._pop(), level
  164. else:
  165. self._write_text("=" * best)
  166. self._write_all(after)
  167. return self._pop(), after_level
  168. def _really_parse_entity(self):
  169. self._write(tokens.HTMLEntityStart())
  170. self._head += 1
  171. this = self._read(strict=True)
  172. if this == "#":
  173. numeric = True
  174. self._write(tokens.HTMLEntityNumeric())
  175. self._head += 1
  176. this = self._read(strict=True)
  177. if this[0].lower() == "x":
  178. hexadecimal = True
  179. self._write(tokens.HTMLEntityHex(char=this[0]))
  180. this = this[1:]
  181. if not this:
  182. self._fail_route()
  183. else:
  184. hexadecimal = False
  185. else:
  186. numeric = hexadecimal = False
  187. valid = string.hexdigits if hexadecimal else string.digits
  188. if not numeric and not hexadecimal:
  189. valid += string.ascii_letters
  190. if not all([char in valid for char in this]):
  191. self._fail_route()
  192. self._head += 1
  193. if self._read() != ";":
  194. self._fail_route()
  195. if numeric:
  196. test = int(this, 16) if hexadecimal else int(this)
  197. if test < 1 or test > 0x10FFFF:
  198. self._fail_route()
  199. else:
  200. if this not in htmlentitydefs.entitydefs:
  201. self._fail_route()
  202. self._write(tokens.Text(text=this))
  203. self._write(tokens.HTMLEntityEnd())
  204. def _parse_entity(self):
  205. reset = self._head
  206. self._push()
  207. try:
  208. self._really_parse_entity()
  209. except BadRoute:
  210. self._head = reset
  211. self._write_text(self._read())
  212. else:
  213. self._write_all(self._pop())
  214. def _parse(self, context=0):
  215. self._push(context)
  216. while True:
  217. this = self._read()
  218. if this not in self.MARKERS:
  219. self._write_text(this)
  220. self._head += 1
  221. continue
  222. if this is self.END:
  223. if self._context & (contexts.TEMPLATE | contexts.HEADING):
  224. self._fail_route()
  225. return self._pop()
  226. prev, next = self._read(-1), self._read(1)
  227. if this == next == "{":
  228. self._parse_template()
  229. elif this == "|" and self._context & contexts.TEMPLATE:
  230. self._handle_template_param()
  231. elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
  232. self._handle_template_param_value()
  233. elif this == next == "}" and self._context & contexts.TEMPLATE:
  234. return self._handle_template_end()
  235. elif (prev == "\n" or prev == self.START) and this == "=" and not self._global & contexts.GL_HEADING:
  236. self._parse_heading()
  237. elif this == "=" and self._context & contexts.HEADING:
  238. return self._handle_heading_end()
  239. elif this == "\n" and self._context & contexts.HEADING:
  240. self._fail_route()
  241. elif this == "&":
  242. self._parse_entity()
  243. else:
  244. self._write_text(this)
  245. self._head += 1
  246. def tokenize(self, text):
  247. split = self.regex.split(text)
  248. self._text = [segment for segment in split if segment]
  249. return self._parse()