A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

368 lines
14 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from collections import defaultdict
  24. import re
  25. from . import HTMLEntity, Node, Text
  26. from .extras import Parameter
  27. from ..compat import range, str
  28. from ..utils import parse_anything
  29. __all__ = ["Template"]
  30. FLAGS = re.DOTALL | re.UNICODE
  31. TEMPLATES = {
  32. "Esp": lambda x: f"* 10^{x.params[0]}",
  33. "smallcaps": lambda x: f"{x.params[0]}",
  34. "Unicode": lambda x: f"{x.params[0]}",
  35. "IPA": lambda x: f"{x.params[0]}",
  36. "transl": lambda x: f"{x.params[-1]}",
  37. "IAST": lambda x: f"{x.params[0]}",
  38. "ssub": lambda x: f"{x.params[0]}",
  39. "SubatomicParticle": lambda x: f"{x.params[0]}",
  40. "convert": lambda x: f"{x.params[0]} {x.params[1]}",
  41. }
  42. class Template(Node):
  43. """Represents a template in wikicode, like ``{{foo}}``."""
  44. def __init__(self, name, params=None):
  45. super(Template, self).__init__()
  46. self.name = name
  47. if params:
  48. self._params = params
  49. else:
  50. self._params = []
  51. def __unicode__(self):
  52. if self.params:
  53. params = "|".join([str(param) for param in self.params])
  54. return "{{" + str(self.name) + "|" + params + "}}"
  55. else:
  56. return "{{" + str(self.name) + "}}"
  57. def __children__(self):
  58. yield self.name
  59. for param in self.params:
  60. if param.showkey:
  61. yield param.name
  62. yield param.value
  63. def __strip__(self, **kwargs):
  64. if str(self.name) in TEMPLATES:
  65. return TEMPLATES[str(self.name)](self)
  66. return None
  67. def __showtree__(self, write, get, mark):
  68. write("{{")
  69. get(self.name)
  70. for param in self.params:
  71. write(" | ")
  72. mark()
  73. get(param.name)
  74. write(" = ")
  75. mark()
  76. get(param.value)
  77. write("}}")
  78. @staticmethod
  79. def _surface_escape(code, char):
  80. """Return *code* with *char* escaped as an HTML entity.
  81. The main use of this is to escape pipes (``|``) or equal signs (``=``)
  82. in parameter names or values so they are not mistaken for new
  83. parameters.
  84. """
  85. replacement = str(HTMLEntity(value=ord(char)))
  86. for node in code.filter_text(recursive=False):
  87. if char in node:
  88. code.replace(node, node.replace(char, replacement), False)
  89. @staticmethod
  90. def _select_theory(theories):
  91. """Return the most likely spacing convention given different options.
  92. Given a dictionary of convention options as keys and their occurrence
  93. as values, return the convention that occurs the most, or ``None`` if
  94. there is no clear preferred style.
  95. """
  96. if theories:
  97. values = tuple(theories.values())
  98. best = max(values)
  99. confidence = float(best) / sum(values)
  100. if confidence > 0.5:
  101. return tuple(theories.keys())[values.index(best)]
  102. @staticmethod
  103. def _blank_param_value(value):
  104. """Remove the content from *value* while keeping its whitespace.
  105. Replace *value*\\ 's nodes with two text nodes, the first containing
  106. whitespace from before its content and the second containing whitespace
  107. from after its content.
  108. """
  109. sval = str(value)
  110. if sval.isspace():
  111. before, after = "", sval
  112. else:
  113. match = re.search(r"^(\s*).*?(\s*)$", sval, FLAGS)
  114. before, after = match.group(1), match.group(2)
  115. value.nodes = [Text(before), Text(after)]
  116. def _get_spacing_conventions(self, use_names):
  117. """Try to determine the whitespace conventions for parameters.
  118. This will examine the existing parameters and use
  119. :meth:`_select_theory` to determine if there are any preferred styles
  120. for how much whitespace to put before or after the value.
  121. """
  122. before_theories = defaultdict(lambda: 0)
  123. after_theories = defaultdict(lambda: 0)
  124. for param in self.params:
  125. if not param.showkey:
  126. continue
  127. if use_names:
  128. component = str(param.name)
  129. else:
  130. component = str(param.value)
  131. match = re.search(r"^(\s*).*?(\s*)$", component, FLAGS)
  132. before, after = match.group(1), match.group(2)
  133. if not use_names and component.isspace() and "\n" in before:
  134. # If the value is empty, we expect newlines in the whitespace
  135. # to be after the content, not before it:
  136. before, after = before.split("\n", 1)
  137. after = "\n" + after
  138. before_theories[before] += 1
  139. after_theories[after] += 1
  140. before = self._select_theory(before_theories)
  141. after = self._select_theory(after_theories)
  142. return before, after
  143. def _fix_dependendent_params(self, i):
  144. """Unhide keys if necessary after removing the param at index *i*."""
  145. if not self.params[i].showkey:
  146. for param in self.params[i + 1:]:
  147. if not param.showkey:
  148. param.showkey = True
  149. def _remove_exact(self, needle, keep_field):
  150. """Remove a specific parameter, *needle*, from the template."""
  151. for i, param in enumerate(self.params):
  152. if param is needle:
  153. if keep_field:
  154. self._blank_param_value(param.value)
  155. else:
  156. self._fix_dependendent_params(i)
  157. self.params.pop(i)
  158. return
  159. raise ValueError(needle)
  160. def _should_remove(self, i, name):
  161. """Look ahead for a parameter with the same name, but hidden.
  162. If one exists, we should remove the given one rather than blanking it.
  163. """
  164. if self.params[i].showkey:
  165. following = self.params[i + 1:]
  166. better_matches = [after.name.strip() == name and not after.showkey
  167. for after in following]
  168. return any(better_matches)
  169. return False
  170. @property
  171. def name(self):
  172. """The name of the template, as a :class:`.Wikicode` object."""
  173. return self._name
  174. @property
  175. def params(self):
  176. """The list of parameters contained within the template."""
  177. return self._params
  178. @name.setter
  179. def name(self, value):
  180. self._name = parse_anything(value)
  181. def has(self, name, ignore_empty=False):
  182. """Return ``True`` if any parameter in the template is named *name*.
  183. With *ignore_empty*, ``False`` will be returned even if the template
  184. contains a parameter with the name *name*, if the parameter's value
  185. is empty. Note that a template may have multiple parameters with the
  186. same name, but only the last one is read by the MediaWiki parser.
  187. """
  188. name = str(name).strip()
  189. for param in self.params:
  190. if param.name.strip() == name:
  191. if ignore_empty and not param.value.strip():
  192. continue
  193. return True
  194. return False
  195. has_param = lambda self, name, ignore_empty=False: \
  196. self.has(name, ignore_empty)
  197. has_param.__doc__ = "Alias for :meth:`has`."
  198. def get(self, name):
  199. """Get the parameter whose name is *name*.
  200. The returned object is a :class:`.Parameter` instance. Raises
  201. :exc:`ValueError` if no parameter has this name. Since multiple
  202. parameters can have the same name, we'll return the last match, since
  203. the last parameter is the only one read by the MediaWiki parser.
  204. """
  205. name = str(name).strip()
  206. for param in reversed(self.params):
  207. if param.name.strip() == name:
  208. return param
  209. raise ValueError(name)
  210. def add(self, name, value, showkey=None, before=None,
  211. preserve_spacing=True):
  212. """Add a parameter to the template with a given *name* and *value*.
  213. *name* and *value* can be anything parsable by
  214. :func:`.utils.parse_anything`; pipes and equal signs are automatically
  215. escaped from *value* when appropriate.
  216. If *name* is already a parameter in the template, we'll replace its
  217. value.
  218. If *showkey* is given, this will determine whether or not to show the
  219. parameter's name (e.g., ``{{foo|bar}}``'s parameter has a name of
  220. ``"1"`` but it is hidden); otherwise, we'll make a safe and intelligent
  221. guess.
  222. If *before* is given (either a :class:`.Parameter` object or a name),
  223. then we will place the parameter immediately before this one.
  224. Otherwise, it will be added at the end. If *before* is a name and
  225. exists multiple times in the template, we will place it before the last
  226. occurrence. If *before* is not in the template, :exc:`ValueError` is
  227. raised. The argument is ignored if *name* is an existing parameter.
  228. If *preserve_spacing* is ``True``, we will try to preserve whitespace
  229. conventions around the parameter, whether it is new or we are updating
  230. an existing value. It is disabled for parameters with hidden keys,
  231. since MediaWiki doesn't strip whitespace in this case.
  232. """
  233. name, value = parse_anything(name), parse_anything(value)
  234. self._surface_escape(value, "|")
  235. if self.has(name):
  236. self.remove(name, keep_field=True)
  237. existing = self.get(name)
  238. if showkey is not None:
  239. existing.showkey = showkey
  240. if not existing.showkey:
  241. self._surface_escape(value, "=")
  242. nodes = existing.value.nodes
  243. if preserve_spacing and existing.showkey:
  244. for i in range(2): # Ignore empty text nodes
  245. if not nodes[i]:
  246. nodes[i] = None
  247. existing.value = parse_anything([nodes[0], value, nodes[1]])
  248. else:
  249. existing.value = value
  250. return existing
  251. if showkey is None:
  252. if Parameter.can_hide_key(name):
  253. int_name = int(str(name))
  254. int_keys = set()
  255. for param in self.params:
  256. if not param.showkey:
  257. int_keys.add(int(str(param.name)))
  258. expected = min(set(range(1, len(int_keys) + 2)) - int_keys)
  259. if expected == int_name:
  260. showkey = False
  261. else:
  262. showkey = True
  263. else:
  264. showkey = True
  265. if not showkey:
  266. self._surface_escape(value, "=")
  267. if preserve_spacing and showkey:
  268. before_n, after_n = self._get_spacing_conventions(use_names=True)
  269. before_v, after_v = self._get_spacing_conventions(use_names=False)
  270. name = parse_anything([before_n, name, after_n])
  271. value = parse_anything([before_v, value, after_v])
  272. param = Parameter(name, value, showkey)
  273. if before:
  274. if not isinstance(before, Parameter):
  275. before = self.get(before)
  276. self.params.insert(self.params.index(before), param)
  277. else:
  278. self.params.append(param)
  279. return param
  280. def remove(self, param, keep_field=False):
  281. """Remove a parameter from the template, identified by *param*.
  282. If *param* is a :class:`.Parameter` object, it will be matched exactly,
  283. otherwise it will be treated like the *name* argument to :meth:`has`
  284. and :meth:`get`.
  285. If *keep_field* is ``True``, we will keep the parameter's name, but
  286. blank its value. Otherwise, we will remove the parameter completely.
  287. When removing a parameter with a hidden name, subsequent parameters
  288. with hidden names will be made visible. For example, removing ``bar``
  289. from ``{{foo|bar|baz}}`` produces ``{{foo|2=baz}}`` because
  290. ``{{foo|baz}}`` is incorrect.
  291. If the parameter shows up multiple times in the template and *param* is
  292. not a :class:`.Parameter` object, we will remove all instances of it
  293. (and keep only one if *keep_field* is ``True`` - either the one with a
  294. hidden name, if it exists, or the first instance).
  295. """
  296. if isinstance(param, Parameter):
  297. return self._remove_exact(param, keep_field)
  298. name = str(param).strip()
  299. removed = False
  300. to_remove = []
  301. for i, param in enumerate(self.params):
  302. if param.name.strip() == name:
  303. if keep_field:
  304. if self._should_remove(i, name):
  305. to_remove.append(i)
  306. else:
  307. self._blank_param_value(param.value)
  308. keep_field = False
  309. else:
  310. self._fix_dependendent_params(i)
  311. to_remove.append(i)
  312. if not removed:
  313. removed = True
  314. if not removed:
  315. raise ValueError(name)
  316. for i in reversed(to_remove):
  317. self.params.pop(i)