A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

wikicode.py 8.9 KiB

12 jaren geleden
12 jaren geleden
12 jaren geleden
12 jaren geleden
12 jaren geleden
12 jaren geleden
12 jaren geleden
12 jaren geleden
12 jaren geleden
12 jaren geleden
12 jaren geleden
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. import re
  24. import sys
  25. from .nodes import Heading, Node, Tag, Template, Text
  26. from .string_mixin import StringMixIn
  27. from .utils import parse_anything
  28. from .compat import str, bytes
  # Names exported by ``from <this module> import *``.
  __all__ = ["Wikicode"]

  # Default regex flags used by the filter/section methods of Wikicode:
  # case-insensitive, "." matches newlines, and Unicode-aware character classes.
  FLAGS = re.UNICODE | re.DOTALL | re.IGNORECASE
  class Wikicode(StringMixIn):
      """A list of wikicode nodes with helpers to search, edit and filter it.

      The node list is available through :attr:`nodes`.  The string form of
      the object is the concatenation of the string forms of its nodes.
      """

      def __init__(self, nodes):
          """Wrap *nodes* (stored as given, not copied) as the node list."""
          super(Wikicode, self).__init__()
          self._nodes = nodes

      def __unicode__(self):
          """Return the string forms of all nodes joined together."""
          return "".join([str(node) for node in self.nodes])

      def _get_children(self, node):
          """Yield every child reported by ``node.__iternodes__``."""
          for context, child in node.__iternodes__(self._get_all_nodes):
              yield child

      def _get_context(self, node, obj):
          """Return the context that ``node.__iternodes__`` pairs with *obj*.

          *obj* is matched by identity.  Raises :exc:`ValueError` when no
          child of *node* is *obj*.
          """
          for context, child in node.__iternodes__(self._get_all_nodes):
              if child is obj:
                  return context
          raise ValueError(obj)

      def _get_all_nodes(self, code):
          """Yield the children of every node in ``code.nodes``."""
          for node in code.nodes:
              for child in self._get_children(node):
                  yield child

      def _is_equivalent(self, obj, node):
          """Return whether *node* matches *obj*.

          ``Node`` instances are compared by identity; anything else (for
          example a string) is compared with ``==``.
          """
          if isinstance(obj, Node):
              return node is obj
          return node == obj

      def _contains(self, nodes, obj):
          """Return whether the iterable *nodes* contains *obj*.

          Uses identity for ``Node`` instances and the ``in`` operator
          (equality) for everything else.
          """
          if isinstance(obj, Node):
              return any(node is obj for node in nodes)
          return obj in nodes

      def _do_search(self, obj, recursive, callback, context, *args, **kwargs):
          """Locate *obj* and call ``callback(context, index, *args, **kwargs)``.

          With *recursive* true, the nodes of *context* are scanned in order:
          a direct match invokes the callback on *context*; otherwise, if
          *obj* is among a node's children, the search restarts inside the
          context returned by :meth:`_get_context`.  With *recursive* false,
          the index comes from ``self.index(obj, recursive=False)``.

          Returns whatever the callback returns.  Raises :exc:`ValueError`
          if *obj* cannot be found.
          """
          if recursive:
              for i, node in enumerate(context.nodes):
                  if self._is_equivalent(obj, node):
                      return callback(context, i, *args, **kwargs)
                  if self._contains(self._get_children(node), obj):
                      context = self._get_context(node, obj)
                      return self._do_search(obj, recursive, callback,
                                             context, *args, **kwargs)
              raise ValueError(obj)
          # Return the result here too so both paths behave the same.
          return callback(context, self.index(obj, recursive=False),
                          *args, **kwargs)

      def _get_tree(self, code, lines, marker, indent):
          """Append the tree representation of *code* to *lines*; return it.

          Each node renders itself through ``node.__showtree__(write, get,
          mark)``: ``write`` emits text, ``get`` renders nested code one
          indent level deeper, and ``mark`` appends *marker* so that the
          next ``write`` continues the previous line instead of starting a
          new one.
          """
          def write(*args):
              if lines and lines[-1] is marker:  # Continue from the last line
                  lines.pop()  # Remove the marker
                  last = lines.pop()
                  lines.append(last + " ".join(args))
              else:
                  lines.append(" " * 6 * indent + " ".join(args))

          def get(code):
              return self._get_tree(code, lines, marker, indent + 1)

          def mark():
              lines.append(marker)

          for node in code.nodes:
              node.__showtree__(write, get, mark)
          return lines

      @property
      def nodes(self):
          """The underlying list of nodes."""
          return self._nodes

      @nodes.setter
      def nodes(self, value):
          self._nodes = value

      def get(self, index):
          """Return the node at position *index*."""
          return self.nodes[index]

      def set(self, index, value):
          """Replace the node at *index* with the single node parsed from *value*.

          *value* is passed through ``parse_anything``.  If that yields no
          nodes, the node at *index* is removed instead.

          Raises :exc:`ValueError` if *value* parses to more than one node and
          :exc:`IndexError` if *index* is out of range.
          """
          nodes = parse_anything(value).nodes
          if len(nodes) > 1:
              raise ValueError("Cannot coerce multiple nodes into one index")
          if index >= len(self.nodes) or -1 * index > len(self.nodes):
              raise IndexError("List assignment index out of range")
          if nodes:
              # Assign in place.  Popping first would shift the list, so the
              # assignment would overwrite the *next* node (or fail at the
              # last index).
              self.nodes[index] = nodes[0]
          else:
              self.nodes.pop(index)

      def index(self, obj, recursive=False):
          """Return the index of *obj* in the node list.

          With *recursive* true, return the index of the first top-level node
          whose children contain *obj*; otherwise the index of the first
          top-level node equivalent to *obj*.  Raises :exc:`ValueError` if
          there is no match.
          """
          if recursive:
              for i, node in enumerate(self.nodes):
                  if self._contains(self._get_children(node), obj):
                      return i
              raise ValueError(obj)
          for i, node in enumerate(self.nodes):
              if self._is_equivalent(obj, node):
                  return i
          raise ValueError(obj)

      def insert(self, index, value):
          """Insert the nodes parsed from *value* at *index*, keeping order."""
          nodes = parse_anything(value).nodes
          # Inserting in reverse at a fixed index preserves the original order.
          for node in reversed(nodes):
              self.nodes.insert(index, node)

      def insert_before(self, obj, value, recursive=True):
          """Insert *value* immediately before *obj*.

          Raises :exc:`ValueError` if *obj* is not found.
          """
          def callback(context, i, value):
              return context.insert(i, value)

          self._do_search(obj, recursive, callback, self, value)

      def insert_after(self, obj, value, recursive=True):
          """Insert *value* immediately after *obj*.

          Raises :exc:`ValueError` if *obj* is not found.
          """
          def callback(context, i, value):
              return context.insert(i + 1, value)

          self._do_search(obj, recursive, callback, self, value)

      def replace(self, obj, value, recursive=True):
          """Replace *obj* with the nodes parsed from *value*.

          Raises :exc:`ValueError` if *obj* is not found.
          """
          def callback(context, i, value):
              context.nodes.pop(i)
              context.insert(i, value)

          self._do_search(obj, recursive, callback, self, value)

      def append(self, value):
          """Append the nodes parsed from *value* to the end of the node list."""
          nodes = parse_anything(value).nodes
          for node in nodes:
              self.nodes.append(node)

      def remove(self, obj, recursive=True):
          """Remove *obj* from the node list.

          Raises :exc:`ValueError` if *obj* is not found.
          """
          def callback(context, i):
              return context.nodes.pop(i)

          self._do_search(obj, recursive, callback, self)

      def ifilter(self, recursive=False, matches=None, flags=FLAGS,
                  forcetype=None):
          """Lazily yield the nodes that pass the given filters.

          *recursive* selects between the top-level nodes and everything
          produced by :meth:`_get_all_nodes`.  *forcetype*, when given, keeps
          only instances of that type.  *matches*, when given, is a regex
          searched (with *flags*) against each node's string form.
          """
          if recursive:
              nodes = self._get_all_nodes(self)
          else:
              nodes = self.nodes
          for node in nodes:
              if not forcetype or isinstance(node, forcetype):
                  if not matches or re.search(matches, str(node), flags):
                      yield node

      def ifilter_templates(self, recursive=False, matches=None, flags=FLAGS):
          """Lazily yield ``Template`` nodes; see :meth:`ifilter`."""
          return self.ifilter(recursive, matches, flags, forcetype=Template)

      def ifilter_text(self, recursive=False, matches=None, flags=FLAGS):
          """Lazily yield ``Text`` nodes; see :meth:`ifilter`."""
          return self.ifilter(recursive, matches, flags, forcetype=Text)

      def ifilter_tags(self, recursive=False, matches=None, flags=FLAGS):
          """Lazily yield ``Tag`` nodes; see :meth:`ifilter`."""
          return self.ifilter(recursive, matches, flags, forcetype=Tag)

      def filter(self, recursive=False, matches=None, flags=FLAGS,
                 forcetype=None):
          """Return the results of :meth:`ifilter` as a list."""
          return list(self.ifilter(recursive, matches, flags, forcetype))

      def filter_templates(self, recursive=False, matches=None, flags=FLAGS):
          """Return the results of :meth:`ifilter_templates` as a list."""
          return list(self.ifilter_templates(recursive, matches, flags))

      def filter_text(self, recursive=False, matches=None, flags=FLAGS):
          """Return the results of :meth:`ifilter_text` as a list."""
          return list(self.ifilter_text(recursive, matches, flags))

      def filter_tags(self, recursive=False, matches=None, flags=FLAGS):
          """Return the results of :meth:`ifilter_tags` as a list."""
          return list(self.ifilter_tags(recursive, matches, flags))

      def get_sections(self, flat=True, matches=None, levels=None, flags=FLAGS,
                       include_headings=True):
          """Split the top-level node list into sections at ``Heading`` nodes.

          *matches*, when given, is a regex for the heading title; it is
          wrapped so that it must be surrounded by matching runs of ``=``.
          *levels*, when given, keeps only headings whose ``level`` is in it.
          Headings that fail these filters do not start a section.

          With *flat* true, a heading closes only the open sections whose
          level is numerically greater than or equal to its own, so a section
          opened by a lower-numbered level keeps running across it.  With
          *flat* false, every heading closes every open section.

          With *include_headings* false, each section starts at the node
          after its heading rather than at the heading itself.

          Returns a list of slices of the node list.
          """
          if matches:
              matches = r"^(=+?)\s*" + matches + r"\s*\1$"
          headings = self.filter(recursive=True, matches=matches, flags=flags,
                                 forcetype=Heading)
          if levels:
              headings = [head for head in headings if head.level in levels]

          sections = []
          # Open sections as [level, start] pairs.  The initial entry covers
          # the nodes before the first heading; its level is a sentinel that
          # any heading will close.  sys.maxsize exists on Python 2 and 3,
          # unlike sys.maxint.
          buffers = [[sys.maxsize, 0]]
          end = len(self.nodes)
          for i, node in enumerate(self.nodes):
              if node not in headings:
                  continue
              this = node.level
              # Rebuild the open list rather than removing from it while
              # iterating, which would skip entries and leave sections open.
              still_open = []
              for level, start in buffers:
                  if not flat or this <= level:
                      sections.append(self.nodes[start:i])
                  else:
                      still_open.append([level, start])
              buffers = still_open
              # Start after the heading when headings are excluded.
              buffers.append([this, i if include_headings else i + 1])
          for level, start in buffers:
              if start != end:  # Skip a trailing section with no nodes
                  sections.append(self.nodes[start:end])
          return sections

      def strip_code(self, normalize=True, collapse=True):
          """Return the text left after each node's ``__strip__`` is applied.

          Every node is asked for ``node.__strip__(normalize, collapse)``;
          falsy results are dropped and the rest are joined.  With *collapse*
          true, leading and trailing newlines are removed and any run of three
          or more newlines is reduced to two.
          """
          nodes = []
          for node in self.nodes:
              stripped = node.__strip__(normalize, collapse)
              if stripped:
                  nodes.append(str(stripped))

          if collapse:
              stripped = "".join(nodes).strip("\n")
              while "\n\n\n" in stripped:
                  stripped = stripped.replace("\n\n\n", "\n\n")
              return stripped
          else:
              return "".join(nodes)

      def get_tree(self):
          """Return a multi-line, indented tree representation of the nodes."""
          marker = object()  # Random object we can find with certainty in a list
          return "\n".join(self._get_tree(self, [], marker, 0))