A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

150 lines
5.7 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from collections import defaultdict
  23. import re
  24. from mwparserfromhell.nodes import HTMLEntity, Node, Text
  25. from mwparserfromhell.nodes.extras import Parameter
  26. from mwparserfromhell.utils import parse_anything
  27. __all__ = ["Template"]
  28. class Template(Node):
  29. def __init__(self, name, params=None):
  30. self._name = name
  31. if params:
  32. self._params = params
  33. else:
  34. self._params = []
  35. def __unicode__(self):
  36. if self.params:
  37. params = u"|".join([unicode(param) for param in self.params])
  38. return "{{" + unicode(self.name) + "|" + params + "}}"
  39. else:
  40. return "{{" + unicode(self.name) + "}}"
  41. def _surface_escape(self, code, char):
  42. replacement = HTMLEntity(value=ord(char))
  43. for node in code.filter_text(recursive=False):
  44. if char in node:
  45. code.replace(node, node.replace(char, replacement))
  46. def _blank_param_value(self, value):
  47. match = re.search("^(\s*).*?(\s*)$", value, re.DOTALL|re.UNICODE)
  48. value.nodes = [Text(match.group(1)), Text(match.group(2))]
  49. def _select_theory(self, theories):
  50. if theories:
  51. best = max(theories.values())
  52. confidence = float(best) / sum(theories.values())
  53. if confidence > 0.75:
  54. return theories.keys()[theories.values().index(best)]
  55. def _get_spacing_conventions(self):
  56. before_theories = defaultdict(lambda: 0)
  57. after_theories = defaultdict(lambda: 0)
  58. for param in self.params:
  59. match = re.search("^(\s*).*?(\s*)$", param.value, re.S|re.U)
  60. before, after = match.group(1), match.group(2)
  61. before_theories[before] += 1
  62. after_theories[after] += 1
  63. before = self._select_theory(before_theories)
  64. after = self._select_theory(after_theories)
  65. return before, after
  66. @property
  67. def name(self):
  68. return self._name
  69. @property
  70. def params(self):
  71. return self._params
  72. def has_param(self, name, ignore_empty=True):
  73. name = name.strip() if isinstance(name, basestring) else unicode(name)
  74. for param in self.params:
  75. if param.name.strip() == name:
  76. if ignore_empty and not param.value.strip():
  77. continue
  78. return True
  79. return False
  80. def get(self, name):
  81. name = name.strip() if isinstance(name, basestring) else unicode(name)
  82. for param in self.params:
  83. if param.name.strip() == name:
  84. return param
  85. raise ValueError(name)
  86. def add(self, name, value, showkey=None, force_nonconformity=False):
  87. name, value = parse_anything(name), parse_anything(value)
  88. self._surface_escape(value, "|")
  89. if self.has_param(name):
  90. self.remove(name, keep_field=True)
  91. existing = self.get(name)
  92. if showkey is None: # Infer showkey from current value
  93. showkey = existing.showkey
  94. if not showkey:
  95. self._surface_escape(value, "=")
  96. nodes = existing.value.nodes
  97. if force_nonconformity:
  98. existing.value = value
  99. else:
  100. existing.value = parse_anything([nodes[0], value, nodes[1]])
  101. return existing
  102. if showkey is None:
  103. try:
  104. int(name)
  105. showkey = True
  106. except ValueError:
  107. showkey = False
  108. if not showkey:
  109. self._surface_escape(value, "=")
  110. if not force_nonconformity:
  111. before, after = self._get_spacing_conventions()
  112. if before and after:
  113. value = parse_anything([before, value, after])
  114. elif before:
  115. value = parse_anything([before, value])
  116. elif after:
  117. value = parse_anything([value, after])
  118. param = Parameter(name, value, showkey)
  119. self.params.append(param)
  120. return param
  121. def remove(self, name, keep_field=False, force_no_field=False):
  122. name = name.strip() if isinstance(name, basestring) else unicode(name)
  123. for i, param in enumerate(self.params):
  124. if param.name.strip() == name:
  125. if keep_field:
  126. return self._blank_param_value(param.value)
  127. dependent = [not after.showkey for after in self.params[i+1:]]
  128. if any(dependent) and not param.showkey and not force_no_field:
  129. return self._blank_param_value(param.value)
  130. return self.params.remove(param)
  131. raise ValueError(name)