A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

wikicode.py 9.8 KiB

11 jaren geleden
11 jaren geleden
12 jaren geleden
11 jaren geleden
11 jaren geleden
11 jaren geleden
11 jaren geleden
12 jaren geleden
11 jaren geleden
11 jaren geleden
11 jaren geleden
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. import re
  24. import sys
  25. from .nodes import Heading, Node, Tag, Template, Text
  26. from .string_mixin import StringMixIn
  27. from .utils import parse_anything
  28. from .compat import str, bytes
  29. __all__ = ["Wikicode"]
  30. FLAGS = re.IGNORECASE | re.DOTALL | re.UNICODE
  31. class Wikicode(StringMixIn):
  32. """A ``Wikicode`` is a container for nodes that functions like a string.
  33. """
  34. def __init__(self, nodes):
  35. super(Wikicode, self).__init__()
  36. self._nodes = nodes
  37. def __unicode__(self):
  38. return "".join([str(node) for node in self.nodes])
  39. def _get_children(self, node):
  40. """Iterate over all descendants of a given node, including itself.
  41. This is implemented by the __iternodes__() generator of Node classes,
  42. which by default yields itself and nothing more.
  43. """
  44. for context, child in node.__iternodes__(self._get_all_nodes):
  45. yield child
  46. def _get_context(self, node, obj):
  47. """Return a ``Wikicode`` that contains ``obj`` in its descendants.
  48. The closest (shortest distance from ``node``) suitable ``Wikicode``
  49. will be returned, or ``None`` if the ``obj`` is the ``node`` itself.
  50. Raises ``ValueError`` if ``obj`` is not within ``node``.
  51. """
  52. for context, child in node.__iternodes__(self._get_all_nodes):
  53. if child is obj:
  54. return context
  55. raise ValueError(obj)
  56. def _get_all_nodes(self, code):
  57. """Iterate over all of our descendant nodes.
  58. This is implemented by calling :py:meth:`_get_children` on every node
  59. in our node list (:py:attr:`self.nodes <nodes>`).
  60. """
  61. for node in code.nodes:
  62. for child in self._get_children(node):
  63. yield child
  64. def _is_equivalent(self, obj, node):
  65. """Return ``True`` if obj and node are equivalent, otherwise ``False``.
  66. """
  67. if isinstance(obj, Node):
  68. if node is obj:
  69. return True
  70. else:
  71. if node == obj:
  72. return True
  73. return False
  74. def _contains(self, nodes, obj):
  75. if isinstance(obj, Node):
  76. for node in nodes:
  77. if node is obj:
  78. return True
  79. else:
  80. if obj in nodes:
  81. return True
  82. return False
  83. def _do_search(self, obj, recursive, callback, context, *args, **kwargs):
  84. if recursive:
  85. for i, node in enumerate(context.nodes):
  86. if self._is_equivalent(obj, node):
  87. return callback(context, i, *args, **kwargs)
  88. if self._contains(self._get_children(node), obj):
  89. context = self._get_context(node, obj)
  90. return self._do_search(obj, recursive, callback, context,
  91. *args, **kwargs)
  92. raise ValueError(obj)
  93. callback(context, self.index(obj, recursive=False), *args, **kwargs)
  94. def _get_tree(self, code, lines, marker, indent):
  95. def write(*args):
  96. if lines and lines[-1] is marker: # Continue from the last line
  97. lines.pop() # Remove the marker
  98. last = lines.pop()
  99. lines.append(last + " ".join(args))
  100. else:
  101. lines.append(" " * 6 * indent + " ".join(args))
  102. get = lambda code: self._get_tree(code, lines, marker, indent + 1)
  103. mark = lambda: lines.append(marker)
  104. for node in code.nodes:
  105. node.__showtree__(write, get, mark)
  106. return lines
  107. @property
  108. def nodes(self):
  109. return self._nodes
  110. @nodes.setter
  111. def nodes(self, value):
  112. self._nodes = value
  113. def get(self, index):
  114. return self.nodes[index]
  115. def set(self, index, value):
  116. nodes = parse_anything(value).nodes
  117. if len(nodes) > 1:
  118. raise ValueError("Cannot coerce multiple nodes into one index")
  119. if index >= len(self.nodes) or -1 * index > len(self.nodes):
  120. raise IndexError("List assignment index out of range")
  121. self.nodes.pop(index)
  122. if nodes:
  123. self.nodes[index] = nodes[0]
  124. def index(self, obj, recursive=False):
  125. if recursive:
  126. for i, node in enumerate(self.nodes):
  127. if self._contains(self._get_children(node), obj):
  128. return i
  129. raise ValueError(obj)
  130. for i, node in enumerate(self.nodes):
  131. if self._is_equivalent(obj, node):
  132. return i
  133. raise ValueError(obj)
  134. def insert(self, index, value):
  135. nodes = parse_anything(value).nodes
  136. for node in reversed(nodes):
  137. self.nodes.insert(index, node)
  138. def insert_before(self, obj, value, recursive=True):
  139. callback = lambda self, i, value: self.insert(i, value)
  140. self._do_search(obj, recursive, callback, self, value)
  141. def insert_after(self, obj, value, recursive=True):
  142. callback = lambda self, i, value: self.insert(i + 1, value)
  143. self._do_search(obj, recursive, callback, self, value)
  144. def replace(self, obj, value, recursive=True):
  145. def callback(self, i, value):
  146. self.nodes.pop(i)
  147. self.insert(i, value)
  148. self._do_search(obj, recursive, callback, self, value)
  149. def append(self, value):
  150. nodes = parse_anything(value).nodes
  151. for node in nodes:
  152. self.nodes.append(node)
  153. def remove(self, obj, recursive=True):
  154. callback = lambda self, i: self.nodes.pop(i)
  155. self._do_search(obj, recursive, callback, self)
  156. def ifilter(self, recursive=False, matches=None, flags=FLAGS,
  157. forcetype=None):
  158. if recursive:
  159. nodes = self._get_all_nodes(self)
  160. else:
  161. nodes = self.nodes
  162. for node in nodes:
  163. if not forcetype or isinstance(node, forcetype):
  164. if not matches or re.search(matches, str(node), flags):
  165. yield node
  166. def ifilter_templates(self, recursive=False, matches=None, flags=FLAGS):
  167. return self.filter(recursive, matches, flags, forcetype=Template)
  168. def ifilter_text(self, recursive=False, matches=None, flags=FLAGS):
  169. return self.filter(recursive, matches, flags, forcetype=Text)
  170. def ifilter_tags(self, recursive=False, matches=None, flags=FLAGS):
  171. return self.ifilter(recursive, matches, flags, forcetype=Tag)
  172. def filter(self, recursive=False, matches=None, flags=FLAGS,
  173. forcetype=None):
  174. return list(self.ifilter(recursive, matches, flags, forcetype))
  175. def filter_templates(self, recursive=False, matches=None, flags=FLAGS):
  176. return list(self.ifilter_templates(recursive, matches, flags))
  177. def filter_text(self, recursive=False, matches=None, flags=FLAGS):
  178. return list(self.ifilter_text(recursive, matches, flags))
  179. def filter_tags(self, recursive=False, matches=None, flags=FLAGS):
  180. return list(self.ifilter_tags(recursive, matches, flags))
  181. def get_sections(self, flat=True, matches=None, levels=None, flags=FLAGS,
  182. include_headings=True):
  183. if matches:
  184. matches = r"^(=+?)\s*" + matches + r"\s*\1$"
  185. headings = self.filter(recursive=True, matches=matches, flags=flags,
  186. forcetype=Heading)
  187. if levels:
  188. headings = [head for head in headings if head.level in levels]
  189. sections = []
  190. buffers = [[sys.maxint, 0]]
  191. i = 0
  192. while i < len(self.nodes):
  193. if self.nodes[i] in headings:
  194. this = self.nodes[i].level
  195. for (level, start) in buffers:
  196. if not flat or this <= level:
  197. buffers.remove([level, start])
  198. sections.append(self.nodes[start:i])
  199. buffers.append([this, i])
  200. if not include_headings:
  201. i += 1
  202. i += 1
  203. for (level, start) in buffers:
  204. if start != i:
  205. sections.append(self.nodes[start:i])
  206. return sections
  207. def strip_code(self, normalize=True, collapse=True):
  208. nodes = []
  209. for node in self.nodes:
  210. stripped = node.__strip__(normalize, collapse)
  211. if stripped:
  212. nodes.append(str(stripped))
  213. if collapse:
  214. stripped = "".join(nodes).strip("\n")
  215. while "\n\n\n" in stripped:
  216. stripped = stripped.replace("\n\n\n", "\n\n")
  217. return stripped
  218. else:
  219. return "".join(nodes)
  220. def get_tree(self):
  221. marker = object() # Random object we can find with certainty in a list
  222. return "\n".join(self._get_tree(self, [], marker, 0))