A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

331 lines
13 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2014 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from collections import defaultdict
  24. import re
  25. from . import HTMLEntity, Node, Text
  26. from .extras import Parameter
  27. from ..compat import range, str
  28. from ..utils import parse_anything
# Public names exported by ``from <this module> import *``.
__all__ = ["Template"]

# Flags for the whitespace-capturing regexes used in this module: DOTALL lets
# "." match newlines and UNICODE makes "\s" follow Unicode whitespace rules.
FLAGS = re.DOTALL | re.UNICODE
  31. class Template(Node):
  32. """Represents a template in wikicode, like ``{{foo}}``."""
  33. def __init__(self, name, params=None):
  34. super(Template, self).__init__()
  35. self._name = name
  36. if params:
  37. self._params = params
  38. else:
  39. self._params = []
  40. def __unicode__(self):
  41. if self.params:
  42. params = "|".join([str(param) for param in self.params])
  43. return "{{" + str(self.name) + "|" + params + "}}"
  44. else:
  45. return "{{" + str(self.name) + "}}"
  46. def __children__(self):
  47. yield self.name
  48. for param in self.params:
  49. if param.showkey:
  50. yield param.name
  51. yield param.value
  52. def __showtree__(self, write, get, mark):
  53. write("{{")
  54. get(self.name)
  55. for param in self.params:
  56. write(" | ")
  57. mark()
  58. get(param.name)
  59. write(" = ")
  60. mark()
  61. get(param.value)
  62. write("}}")
  63. def _surface_escape(self, code, char):
  64. """Return *code* with *char* escaped as an HTML entity.
  65. The main use of this is to escape pipes (``|``) or equal signs (``=``)
  66. in parameter names or values so they are not mistaken for new
  67. parameters.
  68. """
  69. replacement = str(HTMLEntity(value=ord(char)))
  70. for node in code.filter_text(recursive=False):
  71. if char in node:
  72. code.replace(node, node.replace(char, replacement), False)
  73. def _blank_param_value(self, value):
  74. """Remove the content from *value* while keeping its whitespace.
  75. Replace *value*\ 's nodes with two text nodes, the first containing
  76. whitespace from before its content and the second containing whitespace
  77. from after its content.
  78. """
  79. match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS)
  80. value.nodes = [Text(match.group(1)), Text(match.group(2))]
  81. def _select_theory(self, theories):
  82. """Return the most likely spacing convention given different options.
  83. Given a dictionary of convention options as keys and their occurrence as
  84. values, return the convention that occurs the most, or ``None`` if
  85. there is no clear preferred style.
  86. """
  87. if theories:
  88. values = tuple(theories.values())
  89. best = max(values)
  90. confidence = float(best) / sum(values)
  91. if confidence >= 0.75:
  92. return tuple(theories.keys())[values.index(best)]
  93. def _get_spacing_conventions(self, use_names):
  94. """Try to determine the whitespace conventions for parameters.
  95. This will examine the existing parameters and use
  96. :py:meth:`_select_theory` to determine if there are any preferred
  97. styles for how much whitespace to put before or after the value.
  98. """
  99. before_theories = defaultdict(lambda: 0)
  100. after_theories = defaultdict(lambda: 0)
  101. for param in self.params:
  102. if use_names:
  103. component = str(param.name)
  104. else:
  105. component = str(param.value)
  106. match = re.search(r"^(\s*).*?(\s*)$", component, FLAGS)
  107. before, after = match.group(1), match.group(2)
  108. before_theories[before] += 1
  109. after_theories[after] += 1
  110. before = self._select_theory(before_theories)
  111. after = self._select_theory(after_theories)
  112. return before, after
  113. def _remove_with_field(self, param, i, name):
  114. """Return True if a parameter name should be kept, otherwise False."""
  115. if param.showkey:
  116. following = self.params[i+1:]
  117. better_matches = [after.name.strip() == name and not after.showkey for after in following]
  118. if any(better_matches):
  119. return False
  120. return True
  121. def _remove_without_field(self, param, i):
  122. """Return False if a parameter name should be kept, otherwise True."""
  123. if not param.showkey:
  124. dependents = [not after.showkey for after in self.params[i+1:]]
  125. if any(dependents):
  126. return False
  127. return True
  128. def _remove_exact(self, needle, keep_field):
  129. """Remove a specific parameter, *needle*, from the template."""
  130. for i, param in enumerate(self.params):
  131. if param is needle:
  132. if keep_field or not self._remove_without_field(param, i):
  133. self._blank_param_value(param.value)
  134. else:
  135. self.params.pop(i)
  136. return
  137. raise ValueError(needle)
  138. @property
  139. def name(self):
  140. """The name of the template, as a :py:class:`~.Wikicode` object."""
  141. return self._name
  142. @property
  143. def params(self):
  144. """The list of parameters contained within the template."""
  145. return self._params
  146. @name.setter
  147. def name(self, value):
  148. self._name = parse_anything(value)
  149. def has(self, name, ignore_empty=False):
  150. """Return ``True`` if any parameter in the template is named *name*.
  151. With *ignore_empty*, ``False`` will be returned even if the template
  152. contains a parameter with the name *name*, if the parameter's value
  153. is empty. Note that a template may have multiple parameters with the
  154. same name, but only the last one is read by the MediaWiki parser.
  155. """
  156. name = str(name).strip()
  157. for param in self.params:
  158. if param.name.strip() == name:
  159. if ignore_empty and not param.value.strip():
  160. continue
  161. return True
  162. return False
  163. has_param = lambda self, name, ignore_empty=False: \
  164. self.has(name, ignore_empty)
  165. has_param.__doc__ = "Alias for :py:meth:`has`."
  166. def get(self, name):
  167. """Get the parameter whose name is *name*.
  168. The returned object is a :py:class:`~.Parameter` instance. Raises
  169. :py:exc:`ValueError` if no parameter has this name. Since multiple
  170. parameters can have the same name, we'll return the last match, since
  171. the last parameter is the only one read by the MediaWiki parser.
  172. """
  173. name = str(name).strip()
  174. for param in reversed(self.params):
  175. if param.name.strip() == name:
  176. return param
  177. raise ValueError(name)
  178. def add(self, name, value, showkey=None, before=None,
  179. preserve_spacing=True):
  180. """Add a parameter to the template with a given *name* and *value*.
  181. *name* and *value* can be anything parsable by
  182. :py:func:`.utils.parse_anything`; pipes and equal signs are
  183. automatically escaped from *value* when appropriate.
  184. If *showkey* is given, this will determine whether or not to show the
  185. parameter's name (e.g., ``{{foo|bar}}``'s parameter has a name of
  186. ``"1"`` but it is hidden); otherwise, we'll make a safe and intelligent
  187. guess.
  188. If *name* is already a parameter in the template, we'll replace its
  189. value while keeping the same whitespace around it. We will also try to
  190. guess the dominant spacing convention when adding a new parameter using
  191. :py:meth:`_get_spacing_conventions`.
  192. If *before* is given (either a :py:class:`~.Parameter` object or a
  193. name), then we will place the parameter immediately before this one.
  194. Otherwise, it will be added at the end. If *before* is a name and
  195. exists multiple times in the template, we will place it before the last
  196. occurrence. If *before* is not in the template, :py:exc:`ValueError` is
  197. raised. The argument is ignored if the new parameter already exists.
  198. If *preserve_spacing* is ``False``, we will avoid preserving spacing
  199. conventions when changing the value of an existing parameter or when
  200. adding a new one.
  201. """
  202. name, value = parse_anything(name), parse_anything(value)
  203. self._surface_escape(value, "|")
  204. if self.has(name):
  205. self.remove(name, keep_field=True)
  206. existing = self.get(name)
  207. if showkey is not None:
  208. existing.showkey = showkey
  209. if not existing.showkey:
  210. self._surface_escape(value, "=")
  211. nodes = existing.value.nodes
  212. if preserve_spacing:
  213. for i in range(2): # Ignore empty text nodes
  214. if not nodes[i]:
  215. nodes[i] = None
  216. existing.value = parse_anything([nodes[0], value, nodes[1]])
  217. else:
  218. existing.value = value
  219. return existing
  220. if showkey is None:
  221. if Parameter.can_hide_key(name):
  222. int_name = int(str(name))
  223. int_keys = set()
  224. for param in self.params:
  225. if not param.showkey:
  226. int_keys.add(int(str(param.name)))
  227. expected = min(set(range(1, len(int_keys) + 2)) - int_keys)
  228. if expected == int_name:
  229. showkey = False
  230. else:
  231. showkey = True
  232. else:
  233. showkey = True
  234. if not showkey:
  235. self._surface_escape(value, "=")
  236. if preserve_spacing:
  237. before_n, after_n = self._get_spacing_conventions(use_names=True)
  238. before_v, after_v = self._get_spacing_conventions(use_names=False)
  239. name = parse_anything([before_n, name, after_n])
  240. value = parse_anything([before_v, value, after_v])
  241. param = Parameter(name, value, showkey)
  242. if before:
  243. if not isinstance(before, Parameter):
  244. before = self.get(before)
  245. self.params.insert(self.params.index(before), param)
  246. else:
  247. self.params.append(param)
  248. return param
  249. def remove(self, param, keep_field=False):
  250. """Remove a parameter from the template, identified by *param*.
  251. If *param* is a :py:class:`.Parameter` object, it will be matched
  252. exactly, otherwise it will be treated like the *name* argument to
  253. :py:meth:`has` and :py:meth:`get`.
  254. If *keep_field* is ``True``, we will keep the parameter's name, but
  255. blank its value. Otherwise, we will remove the parameter completely
  256. *unless* other parameters are dependent on it (e.g. removing ``bar``
  257. from ``{{foo|bar|baz}}`` is unsafe because ``{{foo|baz}}`` is not what
  258. we expected, so ``{{foo||baz}}`` will be produced instead).
  259. If the parameter shows up multiple times in the template and *param* is
  260. not a :py:class:`.Parameter` object, we will remove all instances of it
  261. (and keep only one if *keep_field* is ``True`` - the first instance if
  262. none have dependents, otherwise the one with dependents will be kept).
  263. """
  264. if isinstance(param, Parameter):
  265. return self._remove_exact(param, keep_field)
  266. name = str(param).strip()
  267. removed = False
  268. to_remove = []
  269. for i, param in enumerate(self.params):
  270. if param.name.strip() == name:
  271. if keep_field:
  272. if self._remove_with_field(param, i, name):
  273. self._blank_param_value(param.value)
  274. keep_field = False
  275. else:
  276. to_remove.append(i)
  277. else:
  278. if self._remove_without_field(param, i):
  279. to_remove.append(i)
  280. else:
  281. self._blank_param_value(param.value)
  282. if not removed:
  283. removed = True
  284. if not removed:
  285. raise ValueError(name)
  286. for i in reversed(to_remove):
  287. self.params.pop(i)