A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

273 lines
9.4 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from . import Node
  24. from .extras import Attribute
  25. from ..compat import str
  26. from ..definitions import is_visible
  27. from ..utils import parse_anything
  28. __all__ = ["Tag"]
  29. class Tag(Node):
  30. """Represents an HTML-style tag in wikicode, like ``<ref>``."""
  31. def __init__(self, tag, contents=None, attrs=None, wiki_markup=None,
  32. self_closing=False, invalid=False, implicit=False, padding="",
  33. closing_tag=None):
  34. super(Tag, self).__init__()
  35. self._tag = tag
  36. if contents is None and not self_closing:
  37. self._contents = parse_anything("")
  38. else:
  39. self._contents = contents
  40. self._attrs = attrs if attrs else []
  41. self._wiki_markup = wiki_markup
  42. self._self_closing = self_closing
  43. self._invalid = invalid
  44. self._implicit = implicit
  45. self._padding = padding
  46. if closing_tag:
  47. self._closing_tag = closing_tag
  48. else:
  49. self._closing_tag = tag
  50. def __unicode__(self):
  51. if self.wiki_markup:
  52. if self.self_closing:
  53. return self.wiki_markup
  54. else:
  55. return self.wiki_markup + str(self.contents) + self.wiki_markup
  56. result = ("</" if self.invalid else "<") + str(self.tag)
  57. if self.attributes:
  58. result += "".join([str(attr) for attr in self.attributes])
  59. if self.self_closing:
  60. result += self.padding + (">" if self.implicit else "/>")
  61. else:
  62. result += self.padding + ">" + str(self.contents)
  63. result += "</" + str(self.closing_tag) + ">"
  64. return result
  65. def __iternodes__(self, getter):
  66. yield None, self
  67. if not self.wiki_markup:
  68. for child in getter(self.tag):
  69. yield self.tag, child
  70. for attr in self.attributes:
  71. for child in getter(attr.name):
  72. yield attr.name, child
  73. if attr.value:
  74. for child in getter(attr.value):
  75. yield attr.value, child
  76. if self.contents:
  77. for child in getter(self.contents):
  78. yield self.contents, child
  79. if not self.self_closing and not self.wiki_markup and self.closing_tag:
  80. for child in getter(self.closing_tag):
  81. yield self.closing_tag, child
  82. def __strip__(self, normalize, collapse):
  83. if self.contents and is_visible(self.tag):
  84. return self.contents.strip_code(normalize, collapse)
  85. return None
  86. def __showtree__(self, write, get, mark):
  87. write("</" if self.invalid else "<")
  88. get(self.tag)
  89. for attr in self.attributes:
  90. get(attr.name)
  91. if not attr.value:
  92. continue
  93. write(" = ")
  94. mark()
  95. get(attr.value)
  96. if self.self_closing:
  97. write(">" if self.implicit else "/>")
  98. else:
  99. write(">")
  100. get(self.contents)
  101. write("</")
  102. get(self.closing_tag)
  103. write(">")
  104. @property
  105. def tag(self):
  106. """The tag itself, as a :py:class:`~.Wikicode` object."""
  107. return self._tag
  108. @property
  109. def contents(self):
  110. """The contents of the tag, as a :py:class:`~.Wikicode` object."""
  111. return self._contents
  112. @property
  113. def attributes(self):
  114. """The list of attributes affecting the tag.
  115. Each attribute is an instance of :py:class:`~.Attribute`.
  116. """
  117. return self._attrs
  118. @property
  119. def wiki_markup(self):
  120. """The wikified version of a tag to show instead of HTML.
  121. If set to a value, this will be displayed instead of the brackets.
  122. For example, set to ``''`` to replace ``<i>`` or ``----`` to replace
  123. ``<hr>``.
  124. """
  125. return self._wiki_markup
  126. @property
  127. def self_closing(self):
  128. """Whether the tag is self-closing with no content (like ``<br/>``)."""
  129. return self._self_closing
  130. @property
  131. def invalid(self):
  132. """Whether the tag starts with a backslash after the opening bracket.
  133. This makes the tag look like a lone close tag. It is technically
  134. invalid and is only parsable Wikicode when the tag itself is
  135. single-only, like ``<br>`` and ``<img>``. See
  136. :py:func:`.definitions.is_single_only`.
  137. """
  138. return self._invalid
  139. @property
  140. def implicit(self):
  141. """Whether the tag is implicitly self-closing, with no ending slash.
  142. This is only possible for specific "single" tags like ``<br>`` and
  143. ``<li>``. See :py:func:`.definitions.is_single`. This field only has an
  144. effect if :py:attr:`self_closing` is also ``True``.
  145. """
  146. return self._implicit
  147. @property
  148. def padding(self):
  149. """Spacing to insert before the first closing ``>``."""
  150. return self._padding
  151. @property
  152. def closing_tag(self):
  153. """The closing tag, as a :py:class:`~.Wikicode` object.
  154. This will usually equal :py:attr:`tag`, unless there is additional
  155. spacing, comments, or the like.
  156. """
  157. return self._closing_tag
  158. @tag.setter
  159. def tag(self, value):
  160. self._tag = self._closing_tag = parse_anything(value)
  161. @contents.setter
  162. def contents(self, value):
  163. self._contents = parse_anything(value)
  164. @wiki_markup.setter
  165. def wiki_markup(self, value):
  166. self._wiki_markup = str(value) if value else None
  167. @self_closing.setter
  168. def self_closing(self, value):
  169. self._self_closing = bool(value)
  170. @invalid.setter
  171. def invalid(self, value):
  172. self._invalid = bool(value)
  173. @implicit.setter
  174. def implicit(self, value):
  175. self._implicit = bool(value)
  176. @padding.setter
  177. def padding(self, value):
  178. if not value:
  179. self._padding = ""
  180. else:
  181. value = str(value)
  182. if not value.isspace():
  183. raise ValueError("padding must be entirely whitespace")
  184. self._padding = value
  185. @closing_tag.setter
  186. def closing_tag(self, value):
  187. self._closing_tag = parse_anything(value)
  188. def has(self, name):
  189. """Return whether any attribute in the tag has the given *name*.
  190. Note that a tag may have multiple attributes with the same name, but
  191. only the last one is read by the MediaWiki parser.
  192. """
  193. for attr in self.attributes:
  194. if attr.name == name.strip():
  195. return True
  196. return False
  197. def get(self, name):
  198. """Get the attribute with the given *name*.
  199. The returned object is a :py:class:`~.Attribute` instance. Raises
  200. :py:exc:`ValueError` if no attribute has this name. Since multiple
  201. attributes can have the same name, we'll return the last match, since
  202. all but the last are ignored by the MediaWiki parser.
  203. """
  204. for attr in reversed(self.attributes):
  205. if attr.name == name.strip():
  206. return attr
  207. raise ValueError(name)
  208. def add(self, name, value=None, quoted=True, pad_first=" ",
  209. pad_before_eq="", pad_after_eq=""):
  210. """Add an attribute with the given *name* and *value*.
  211. *name* and *value* can be anything parasable by
  212. :py:func:`.utils.parse_anything`; *value* can be omitted if the
  213. attribute is valueless. *quoted* is a bool telling whether to wrap the
  214. *value* in double quotes (this is recommended). *pad_first*,
  215. *pad_before_eq*, and *pad_after_eq* are whitespace used as padding
  216. before the name, before the equal sign (or after the name if no value),
  217. and after the equal sign (ignored if no value), respectively.
  218. """
  219. if value is not None:
  220. value = parse_anything(value)
  221. attr = Attribute(parse_anything(name), value, quoted)
  222. attr.pad_first = pad_first
  223. attr.pad_before_eq = pad_before_eq
  224. attr.pad_after_eq = pad_after_eq
  225. self.attributes.append(attr)
  226. return attr
  227. def remove(self, name):
  228. """Remove all attributes with the given *name*."""
  229. attrs = [attr for attr in self.attributes if attr.name == name.strip()]
  230. if not attrs:
  231. raise ValueError(name)
  232. for attr in attrs:
  233. self.attributes.remove(attr)