A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

304 lines
12 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from collections import defaultdict
  24. import re
  25. from . import HTMLEntity, Node, Text
  26. from .extras import Parameter
  27. from ..compat import basestring, str
  28. from ..utils import parse_anything
  29. __all__ = ["Template"]
  30. FLAGS = re.DOTALL | re.UNICODE
  31. class Template(Node):
  32. """Represents a template in wikicode, like ``{{foo}}``."""
  33. def __init__(self, name, params=None):
  34. super(Template, self).__init__()
  35. self._name = name
  36. if params:
  37. self._params = params
  38. else:
  39. self._params = []
  40. def __unicode__(self):
  41. if self.params:
  42. params = "|".join([str(param) for param in self.params])
  43. return "{{" + str(self.name) + "|" + params + "}}"
  44. else:
  45. return "{{" + str(self.name) + "}}"
  46. def __iternodes__(self, getter):
  47. yield None, self
  48. for child in getter(self.name):
  49. yield self.name, child
  50. for param in self.params:
  51. if param.showkey:
  52. for child in getter(param.name):
  53. yield param.name, child
  54. for child in getter(param.value):
  55. yield param.value, child
  56. def __showtree__(self, write, get, mark):
  57. write("{{")
  58. get(self.name)
  59. for param in self.params:
  60. write(" | ")
  61. mark()
  62. get(param.name)
  63. write(" = ")
  64. mark()
  65. get(param.value)
  66. write("}}")
  67. def _surface_escape(self, code, char):
  68. """Return *code* with *char* escaped as an HTML entity.
  69. The main use of this is to escape pipes (``|``) or equal signs (``=``)
  70. in parameter names or values so they are not mistaken for new
  71. parameters.
  72. """
  73. replacement = HTMLEntity(value=ord(char))
  74. for node in code.filter_text(recursive=False):
  75. if char in node:
  76. code.replace(node, node.replace(char, replacement))
  77. def _blank_param_value(self, value):
  78. """Remove the content from *value* while keeping its whitespace.
  79. Replace *value*\ 's nodes with two text nodes, the first containing
  80. whitespace from before its content and the second containing whitespace
  81. from after its content.
  82. """
  83. match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS)
  84. value.nodes = [Text(match.group(1)), Text(match.group(2))]
  85. def _select_theory(self, theories):
  86. """Return the most likely spacing convention given different options.
  87. Given a dictionary of convention options as keys and their occurance as
  88. values, return the convention that occurs the most, or ``None`` if
  89. there is no clear preferred style.
  90. """
  91. if theories:
  92. values = tuple(theories.values())
  93. best = max(values)
  94. confidence = float(best) / sum(values)
  95. if confidence > 0.75:
  96. return tuple(theories.keys())[values.index(best)]
  97. def _get_spacing_conventions(self, use_names):
  98. """Try to determine the whitespace conventions for parameters.
  99. This will examine the existing parameters and use
  100. :py:meth:`_select_theory` to determine if there are any preferred
  101. styles for how much whitespace to put before or after the value.
  102. """
  103. before_theories = defaultdict(lambda: 0)
  104. after_theories = defaultdict(lambda: 0)
  105. for param in self.params:
  106. if use_names:
  107. component = str(param.name)
  108. else:
  109. component = str(param.value)
  110. match = re.search(r"^(\s*).*?(\s*)$", component, FLAGS)
  111. before, after = match.group(1), match.group(2)
  112. before_theories[before] += 1
  113. after_theories[after] += 1
  114. before = self._select_theory(before_theories)
  115. after = self._select_theory(after_theories)
  116. return before, after
  117. def _remove_with_field(self, param, i, name):
  118. """Return True if a parameter name should be kept, otherwise False."""
  119. if param.showkey:
  120. following = self.params[i+1:]
  121. better_matches = [after.name.strip() == name and not after.showkey for after in following]
  122. if any(better_matches):
  123. return False
  124. return True
  125. def _remove_without_field(self, param, i, force_no_field):
  126. """Return False if a parameter name should be kept, otherwise True."""
  127. if not param.showkey and not force_no_field:
  128. dependents = [not after.showkey for after in self.params[i+1:]]
  129. if any(dependents):
  130. return False
  131. return True
  132. @property
  133. def name(self):
  134. """The name of the template, as a ``Wikicode`` object."""
  135. return self._name
  136. @property
  137. def params(self):
  138. """The list of parameters contained within the template."""
  139. return self._params
  140. @name.setter
  141. def name(self, value):
  142. self._name = parse_anything(value)
  143. def has_param(self, name, ignore_empty=True):
  144. """Return ``True`` if any parameter in the template is named *name*.
  145. With *ignore_empty*, ``False`` will be returned even if the template
  146. contains a parameter with the name *name*, if the parameter's value
  147. is empty. Note that a template may have multiple parameters with the
  148. same name.
  149. """
  150. name = name.strip() if isinstance(name, basestring) else str(name)
  151. for param in self.params:
  152. if param.name.strip() == name:
  153. if ignore_empty and not param.value.strip():
  154. continue
  155. return True
  156. return False
  157. def get(self, name):
  158. """Get the parameter whose name is *name*.
  159. The returned object is a
  160. :py:class:`~mwparserfromhell.nodes.extras.parameter.Parameter`
  161. instance. Raises :py:exc:`ValueError` if no parameter has this name.
  162. Since multiple parameters can have the same name, we'll return the last
  163. match, since the last parameter is the only one read by the MediaWiki
  164. parser.
  165. """
  166. name = name.strip() if isinstance(name, basestring) else str(name)
  167. for param in reversed(self.params):
  168. if param.name.strip() == name:
  169. return param
  170. raise ValueError(name)
  171. def add(self, name, value, showkey=None, force_nonconformity=False):
  172. """Add a parameter to the template with a given *name* and *value*.
  173. *name* and *value* can be anything parasable by
  174. :py:func:`mwparserfromhell.utils.parse_anything`; pipes (and equal
  175. signs, if appropriate) are automatically escaped from *value* where
  176. applicable. If *showkey* is given, this will determine whether or not
  177. to show the parameter's name (e.g., ``{{foo|bar}}``'s parameter has a
  178. name of ``"1"`` but it is hidden); otherwise, we'll make a safe and
  179. intelligent guess. If *name* is already a parameter, we'll replace its
  180. value while keeping the same spacing rules unless *force_nonconformity*
  181. is ``True``. We will also try to guess the dominant spacing convention
  182. when adding a new parameter using :py:meth:`_get_spacing_conventions`
  183. unless *force_nonconformity* is ``True``.
  184. """
  185. name, value = parse_anything(name), parse_anything(value)
  186. self._surface_escape(value, "|")
  187. if self.has_param(name):
  188. self.remove(name, keep_field=True)
  189. existing = self.get(name)
  190. if showkey is not None:
  191. if not showkey:
  192. self._surface_escape(value, "=")
  193. existing.showkey = showkey
  194. nodes = existing.value.nodes
  195. if force_nonconformity:
  196. existing.value = value
  197. else:
  198. existing.value = parse_anything([nodes[0], value, nodes[1]])
  199. return existing
  200. if showkey is None:
  201. try:
  202. int_name = int(str(name))
  203. except ValueError:
  204. showkey = True
  205. else:
  206. int_keys = set()
  207. for param in self.params:
  208. if not param.showkey:
  209. if re.match(r"[1-9][0-9]*$", param.name.strip()):
  210. int_keys.add(int(str(param.name)))
  211. expected = min(set(range(1, len(int_keys) + 2)) - int_keys)
  212. if expected == int_name:
  213. showkey = False
  214. else:
  215. showkey = True
  216. if not showkey:
  217. self._surface_escape(value, "=")
  218. if not force_nonconformity:
  219. before_n, after_n = self._get_spacing_conventions(use_names=True)
  220. if before_n and after_n:
  221. name = parse_anything([before_n, value, after_n])
  222. elif before_n:
  223. name = parse_anything([before_n, value])
  224. elif after_n:
  225. name = parse_anything([value, after_n])
  226. before_v, after_v = self._get_spacing_conventions(use_names=False)
  227. if before_v and after_v:
  228. value = parse_anything([before_v, value, after_v])
  229. elif before_v:
  230. value = parse_anything([before_v, value])
  231. elif after_v:
  232. value = parse_anything([value, after_v])
  233. param = Parameter(name, value, showkey)
  234. self.params.append(param)
  235. return param
  236. def remove(self, name, keep_field=False, force_no_field=False):
  237. """Remove a parameter from the template whose name is *name*.
  238. If *keep_field* is ``True``, we will keep the parameter's name, but
  239. blank its value. Otherwise, we will remove the parameter completely
  240. *unless* other parameters are dependent on it (e.g. removing ``bar``
  241. from ``{{foo|bar|baz}}`` is unsafe because ``{{foo|baz}}`` is not what
  242. we expected, so ``{{foo||baz}}`` will be produced instead), unless
  243. *force_no_field* is also ``True``. If the parameter shows up multiple
  244. times in the template, we will remove all instances of it (and keep
  245. one if *keep_field* is ``True`` - that being the first instance if
  246. none of the instances have dependents, otherwise that instance will be
  247. kept).
  248. """
  249. name = name.strip() if isinstance(name, basestring) else str(name)
  250. removed = False
  251. for i, param in enumerate(self.params):
  252. if param.name.strip() == name:
  253. if keep_field:
  254. if self._remove_with_field(param, i, name):
  255. self._blank_param_value(param.value)
  256. keep_field = False
  257. else:
  258. self.params.remove(param)
  259. else:
  260. if self._remove_without_field(param, i, force_no_field):
  261. self.params.remove(param)
  262. else:
  263. self._blank_param_value(param.value)
  264. if not removed:
  265. removed = True
  266. if not removed:
  267. raise ValueError(name)