A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

217 lines
7.8 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from collections import defaultdict
  24. import re
  25. from . import HTMLEntity, Node, Text
  26. from .extras import Parameter
  27. from ..compat import basestring, str
  28. from ..utils import parse_anything
  29. __all__ = ["Template"]
  30. FLAGS = re.DOTALL | re.UNICODE
  31. class Template(Node):
  32. def __init__(self, name, params=None):
  33. super(Template, self).__init__()
  34. self._name = name
  35. if params:
  36. self._params = params
  37. else:
  38. self._params = []
  39. def __unicode__(self):
  40. if self.params:
  41. params = "|".join([str(param) for param in self.params])
  42. return "{{" + str(self.name) + "|" + params + "}}"
  43. else:
  44. return "{{" + str(self.name) + "}}"
  45. def __iternodes__(self, getter):
  46. yield None, self
  47. for child in getter(self.name):
  48. yield self.name, child
  49. for param in self.params:
  50. if param.showkey:
  51. for child in getter(param.name):
  52. yield param.name, child
  53. for child in getter(param.value):
  54. yield param.value, child
  55. def __showtree__(self, write, get, mark):
  56. write("{{")
  57. get(self.name)
  58. for param in self.params:
  59. write(" | ")
  60. mark()
  61. get(param.name)
  62. write(" = ")
  63. mark()
  64. get(param.value)
  65. write("}}")
  66. def _surface_escape(self, code, char):
  67. replacement = HTMLEntity(value=ord(char))
  68. for node in code.filter_text(recursive=False):
  69. if char in node:
  70. code.replace(node, node.replace(char, replacement))
  71. def _blank_param_value(self, value):
  72. match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS)
  73. value.nodes = [Text(match.group(1)), Text(match.group(2))]
  74. def _select_theory(self, theories):
  75. if theories:
  76. best = max(theories.values())
  77. confidence = float(best) / sum(theories.values())
  78. if confidence > 0.75:
  79. keys = tuple(theories.keys())
  80. return keys[tuple(theories.values()).index(best)]
  81. def _get_spacing_conventions(self):
  82. before_theories = defaultdict(lambda: 0)
  83. after_theories = defaultdict(lambda: 0)
  84. for param in self.params:
  85. match = re.search(r"^(\s*).*?(\s*)$", str(param.value), FLAGS)
  86. before, after = match.group(1), match.group(2)
  87. before_theories[before] += 1
  88. after_theories[after] += 1
  89. before = self._select_theory(before_theories)
  90. after = self._select_theory(after_theories)
  91. return before, after
  92. def _remove_with_field(self, param, i, name):
  93. if param.showkey:
  94. following = self.params[i+1:]
  95. better_matches = [after.name.strip() == name and not after.showkey for after in following]
  96. if any(better_matches):
  97. return False
  98. return True
  99. def _remove_without_field(self, param, i, force_no_field):
  100. if not param.showkey and not force_no_field:
  101. dependents = [not after.showkey for after in self.params[i+1:]]
  102. if any(dependents):
  103. return False
  104. return True
  105. @property
  106. def name(self):
  107. return self._name
  108. @property
  109. def params(self):
  110. return self._params
  111. @name.setter
  112. def name(self, value):
  113. self._name = parse_anything(value)
  114. def has_param(self, name, ignore_empty=True):
  115. name = name.strip() if isinstance(name, basestring) else str(name)
  116. for param in self.params:
  117. if param.name.strip() == name:
  118. if ignore_empty and not param.value.strip():
  119. continue
  120. return True
  121. return False
  122. def get(self, name):
  123. name = name.strip() if isinstance(name, basestring) else str(name)
  124. for param in reversed(self.params):
  125. if param.name.strip() == name:
  126. return param
  127. raise ValueError(name)
  128. def add(self, name, value, showkey=None, force_nonconformity=False):
  129. name, value = parse_anything(name), parse_anything(value)
  130. self._surface_escape(value, "|")
  131. if self.has_param(name):
  132. self.remove(name, keep_field=True)
  133. existing = self.get(name)
  134. if showkey is not None:
  135. if not showkey:
  136. self._surface_escape(value, "=")
  137. existing.showkey = showkey
  138. nodes = existing.value.nodes
  139. if force_nonconformity:
  140. existing.value = value
  141. else:
  142. existing.value = parse_anything([nodes[0], value, nodes[1]])
  143. return existing
  144. if showkey is None:
  145. try:
  146. int_name = int(str(name))
  147. except ValueError:
  148. showkey = True
  149. else:
  150. int_keys = set()
  151. for param in self.params:
  152. if not param.showkey:
  153. if re.match(r"[1-9][0-9]*$", param.name.strip()):
  154. int_keys.add(int(str(param.name)))
  155. expected = min(set(range(1, len(int_keys) + 2)) - int_keys)
  156. if expected == int_name:
  157. showkey = False
  158. else:
  159. showkey = True
  160. if not showkey:
  161. self._surface_escape(value, "=")
  162. if not force_nonconformity:
  163. before, after = self._get_spacing_conventions()
  164. if before and after:
  165. value = parse_anything([before, value, after])
  166. elif before:
  167. value = parse_anything([before, value])
  168. elif after:
  169. value = parse_anything([value, after])
  170. param = Parameter(name, value, showkey)
  171. self.params.append(param)
  172. return param
  173. def remove(self, name, keep_field=False, force_no_field=False):
  174. name = name.strip() if isinstance(name, basestring) else str(name)
  175. removed = False
  176. for i, param in enumerate(self.params):
  177. if param.name.strip() == name:
  178. if keep_field:
  179. if self._remove_with_field(param, i, name):
  180. self._blank_param_value(param.value)
  181. keep_field = False
  182. else:
  183. self.params.remove(param)
  184. else:
  185. if self._remove_without_field(param, i, force_no_field):
  186. self.params.remove(param)
  187. else:
  188. self._blank_param_value(param.value)
  189. if not removed:
  190. removed = True
  191. if not removed:
  192. raise ValueError(name)