A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

11 лет назад
11 лет назад
11 лет назад
11 лет назад
5 лет назад
11 лет назад
11 лет назад
11 лет назад
5 лет назад
11 лет назад
11 лет назад
11 лет назад
11 лет назад
11 лет назад
11 лет назад
11 лет назад
11 лет назад
11 лет назад
11 лет назад
11 лет назад
11 лет назад
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680
  1. #
  2. # Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
  3. #
  4. # Permission is hereby granted, free of charge, to any person obtaining a copy
  5. # of this software and associated documentation files (the "Software"), to deal
  6. # in the Software without restriction, including without limitation the rights
  7. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. # copies of the Software, and to permit persons to whom the Software is
  9. # furnished to do so, subject to the following conditions:
  10. #
  11. # The above copyright notice and this permission notice shall be included in
  12. # all copies or substantial portions of the Software.
  13. #
  14. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  20. # SOFTWARE.
  21. import re
  22. from itertools import chain
  23. from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity,
  24. Node, Tag, Template, Text, Wikilink)
  25. from .smart_list.list_proxy import ListProxy
  26. from .string_mixin import StringMixIn
  27. from .utils import parse_anything
# Public names exported by this module.
__all__ = ["Wikicode"]

# Default regex flags, used as the default *flags* argument of the filtering
# methods and of Wikicode.get_sections().
FLAGS = re.IGNORECASE | re.DOTALL | re.UNICODE
  30. class Wikicode(StringMixIn):
  31. """A ``Wikicode`` is a container for nodes that operates like a string.
  32. Additionally, it contains methods that can be used to extract data from or
  33. modify the nodes, implemented in an interface similar to a list. For
  34. example, :meth:`index` can get the index of a node in the list, and
  35. :meth:`insert` can add a new node at that index. The :meth:`filter()
  36. <ifilter>` series of functions is very useful for extracting and iterating
  37. over, for example, all of the templates in the object.
  38. """
  39. RECURSE_OTHERS = 2
    def __init__(self, nodes):
        """Create a ``Wikicode`` around the given list of *nodes*.

        The list is stored as given (it is not copied), and is what the
        :attr:`nodes` property returns.
        """
        super().__init__()
        self._nodes = nodes
  43. def __str__(self):
  44. return "".join([str(node) for node in self.nodes])
  45. @staticmethod
  46. def _get_children(node, contexts=False, restrict=None, parent=None):
  47. """Iterate over all child :class:`.Node`\\ s of a given *node*."""
  48. yield (parent, node) if contexts else node
  49. if restrict and isinstance(node, restrict):
  50. return
  51. for code in node.__children__():
  52. for child in code.nodes:
  53. sub = Wikicode._get_children(child, contexts, restrict, code)
  54. yield from sub
  55. @staticmethod
  56. def _slice_replace(code, index, old, new):
  57. """Replace the string *old* with *new* across *index* in *code*."""
  58. nodes = [str(node) for node in code.get(index)]
  59. substring = "".join(nodes).replace(old, new)
  60. code.nodes[index] = parse_anything(substring).nodes
  61. @staticmethod
  62. def _build_matcher(matches, flags):
  63. """Helper for :meth:`_indexed_ifilter` and others.
  64. If *matches* is a function, return it. If it's a regex, return a
  65. wrapper around it that can be called with a node to do a search. If
  66. it's ``None``, return a function that always returns ``True``.
  67. """
  68. if matches:
  69. if callable(matches):
  70. return matches
  71. return lambda obj: re.search(matches, str(obj), flags)
  72. return lambda obj: True
  73. def _indexed_ifilter(self, recursive=True, matches=None, flags=FLAGS,
  74. forcetype=None):
  75. """Iterate over nodes and their corresponding indices in the node list.
  76. The arguments are interpreted as for :meth:`ifilter`. For each tuple
  77. ``(i, node)`` yielded by this method, ``self.index(node) == i``. Note
  78. that if *recursive* is ``True``, ``self.nodes[i]`` might not be the
  79. node itself, but will still contain it.
  80. """
  81. match = self._build_matcher(matches, flags)
  82. if recursive:
  83. restrict = forcetype if recursive == self.RECURSE_OTHERS else None
  84. def getter(i, node):
  85. for ch in self._get_children(node, restrict=restrict):
  86. yield (i, ch)
  87. inodes = chain(*(getter(i, n) for i, n in enumerate(self.nodes)))
  88. else:
  89. inodes = enumerate(self.nodes)
  90. for i, node in inodes:
  91. if (not forcetype or isinstance(node, forcetype)) and match(node):
  92. yield (i, node)
  93. def _is_child_wikicode(self, obj, recursive=True):
  94. """Return whether the given :class:`.Wikicode` is a descendant."""
  95. def deref(nodes):
  96. if isinstance(nodes, ListProxy):
  97. return nodes._parent # pylint: disable=protected-access
  98. return nodes
  99. target = deref(obj.nodes)
  100. if target is deref(self.nodes):
  101. return True
  102. if recursive:
  103. todo = [self]
  104. while todo:
  105. code = todo.pop()
  106. if target is deref(code.nodes):
  107. return True
  108. for node in code.nodes:
  109. todo += list(node.__children__())
  110. return False
  111. def _do_strong_search(self, obj, recursive=True):
  112. """Search for the specific element *obj* within the node list.
  113. *obj* can be either a :class:`.Node` or a :class:`.Wikicode` object. If
  114. found, we return a tuple (*context*, *index*) where *context* is the
  115. :class:`.Wikicode` that contains *obj* and *index* is its index there,
  116. as a :class:`slice`. Note that if *recursive* is ``False``, *context*
  117. will always be ``self`` (since we only look for *obj* among immediate
  118. descendants), but if *recursive* is ``True``, then it could be any
  119. :class:`.Wikicode` contained by a node within ``self``. If *obj* is not
  120. found, :exc:`ValueError` is raised.
  121. """
  122. if isinstance(obj, Wikicode):
  123. if not self._is_child_wikicode(obj, recursive):
  124. raise ValueError(obj)
  125. return obj, slice(0, len(obj.nodes))
  126. if isinstance(obj, Node):
  127. mkslice = lambda i: slice(i, i + 1)
  128. if not recursive:
  129. return self, mkslice(self.index(obj))
  130. for node in self.nodes:
  131. for context, child in self._get_children(node, contexts=True):
  132. if obj is child:
  133. if not context:
  134. context = self
  135. return context, mkslice(context.index(child))
  136. raise ValueError(obj)
  137. raise TypeError(obj)
    def _do_weak_search(self, obj, recursive):
        """Search for an element that looks like *obj* within the node list.

        This follows the same rules as :meth:`_do_strong_search` with some
        differences. *obj* is parsed with :func:`.parse_anything` and compared
        by equality rather than identity, so several places may match; the
        result is therefore a list of ``(exact, context, index)`` tuples.
        :exc:`ValueError` is raised if *obj* is empty, is not contained in
        ``self``, or (when *recursive* is false) has no node-level match.

        *exact* is ``True`` when *obj* lines up with one or more adjacent
        nodes of *context*. If no such alignment is found anywhere and
        *recursive* is true, a single ``(False, self, <slice of all nodes>)``
        entry is returned so callers can fall back to a string replacement.
        """
        obj = parse_anything(obj)
        if not obj or obj not in self:
            raise ValueError(obj)
        results = []
        # Stack of Wikicode objects still to be scanned.
        contexts = [self]
        while contexts:
            context = contexts.pop()
            # Scan this context's nodes from last to first.
            i = len(context.nodes) - 1
            while i >= 0:
                node = context.get(i)
                if obj.get(-1) == node:
                    # The last node of obj lines up with position i; check
                    # that the remaining nodes of obj line up with the nodes
                    # just before it (obj[j] against context[i + j + 1]).
                    for j in range(-len(obj.nodes), -1):
                        if obj.get(j) != context.get(i + j + 1):
                            break
                    else:
                        # Every node matched: move i to the first node of the
                        # match so the decrement below steps past all of it.
                        i -= len(obj.nodes) - 1
                        index = slice(i, i + len(obj.nodes))
                        results.append((True, context, index))
                elif recursive and obj in node:
                    # obj appears somewhere inside this node; queue the
                    # node's child Wikicode objects for scanning.
                    contexts.extend(node.__children__())
                i -= 1
        if not results:
            if not recursive:
                raise ValueError(obj)
            # obj is contained in self but not aligned to node boundaries.
            results.append((False, self, slice(0, len(self.nodes))))
        return results
  179. def _get_tree(self, code, lines, marker, indent):
  180. """Build a tree to illustrate the way the Wikicode object was parsed.
  181. The method that builds the actual tree is ``__showtree__`` of ``Node``
  182. objects. *code* is the ``Wikicode`` object to build a tree for. *lines*
  183. is the list to append the tree to, which is returned at the end of the
  184. method. *marker* is some object to be used to indicate that the builder
  185. should continue on from the last line instead of starting a new one; it
  186. should be any object that can be tested for with ``is``. *indent* is
  187. the starting indentation.
  188. """
  189. def write(*args):
  190. """Write a new line following the proper indentation rules."""
  191. if lines and lines[-1] is marker: # Continue from the last line
  192. lines.pop() # Remove the marker
  193. last = lines.pop()
  194. lines.append(last + " ".join(args))
  195. else:
  196. lines.append(" " * 6 * indent + " ".join(args))
  197. get = lambda code: self._get_tree(code, lines, marker, indent + 1)
  198. mark = lambda: lines.append(marker)
  199. for node in code.nodes:
  200. node.__showtree__(write, get, mark)
  201. return lines
  202. @classmethod
  203. def _build_filter_methods(cls, **meths):
  204. """Given Node types, build the corresponding i?filter shortcuts.
  205. The should be given as keys storing the method's base name paired with
  206. values storing the corresponding :class:`.Node` type. For example, the
  207. dict may contain the pair ``("templates", Template)``, which will
  208. produce the methods :meth:`ifilter_templates` and
  209. :meth:`filter_templates`, which are shortcuts for
  210. :meth:`ifilter(forcetype=Template) <ifilter>` and
  211. :meth:`filter(forcetype=Template) <filter>`, respectively. These
  212. shortcuts are added to the class itself, with an appropriate docstring.
  213. """
  214. doc = """Iterate over {0}.
  215. This is equivalent to :meth:`{1}` with *forcetype* set to
  216. :class:`~{2.__module__}.{2.__name__}`.
  217. """
  218. make_ifilter = lambda ftype: (lambda self, *a, **kw:
  219. self.ifilter(forcetype=ftype, *a, **kw))
  220. make_filter = lambda ftype: (lambda self, *a, **kw:
  221. self.filter(forcetype=ftype, *a, **kw))
  222. for name, ftype in meths.items():
  223. ifilt = make_ifilter(ftype)
  224. filt = make_filter(ftype)
  225. ifilt.__doc__ = doc.format(name, "ifilter", ftype)
  226. filt.__doc__ = doc.format(name, "filter", ftype)
  227. setattr(cls, "ifilter_" + name, ifilt)
  228. setattr(cls, "filter_" + name, filt)
  229. @property
  230. def nodes(self):
  231. """A list of :class:`.Node` objects.
  232. This is the internal data actually stored within a :class:`.Wikicode`
  233. object.
  234. """
  235. return self._nodes
  236. @nodes.setter
  237. def nodes(self, value):
  238. if not isinstance(value, list):
  239. value = parse_anything(value).nodes
  240. self._nodes = value
  241. def get(self, index):
  242. """Return the *index*\\ th node within the list of nodes."""
  243. return self.nodes[index]
  244. def set(self, index, value):
  245. """Set the ``Node`` at *index* to *value*.
  246. Raises :exc:`IndexError` if *index* is out of range, or
  247. :exc:`ValueError` if *value* cannot be coerced into one :class:`.Node`.
  248. To insert multiple nodes at an index, use :meth:`get` with either
  249. :meth:`remove` and :meth:`insert` or :meth:`replace`.
  250. """
  251. nodes = parse_anything(value).nodes
  252. if len(nodes) > 1:
  253. raise ValueError("Cannot coerce multiple nodes into one index")
  254. if index >= len(self.nodes) or -1 * index > len(self.nodes):
  255. raise IndexError("List assignment index out of range")
  256. if nodes:
  257. self.nodes[index] = nodes[0]
  258. else:
  259. self.nodes.pop(index)
  260. def contains(self, obj):
  261. """Return whether this Wikicode object contains *obj*.
  262. If *obj* is a :class:`.Node` or :class:`.Wikicode` object, then we
  263. search for it exactly among all of our children, recursively.
  264. Otherwise, this method just uses :meth:`.__contains__` on the string.
  265. """
  266. if not isinstance(obj, (Node, Wikicode)):
  267. return obj in self
  268. try:
  269. self._do_strong_search(obj, recursive=True)
  270. except ValueError:
  271. return False
  272. return True
  273. def index(self, obj, recursive=False):
  274. """Return the index of *obj* in the list of nodes.
  275. Raises :exc:`ValueError` if *obj* is not found. If *recursive* is
  276. ``True``, we will look in all nodes of ours and their descendants, and
  277. return the index of our direct descendant node within *our* list of
  278. nodes. Otherwise, the lookup is done only on direct descendants.
  279. """
  280. strict = isinstance(obj, Node)
  281. equivalent = (lambda o, n: o is n) if strict else (lambda o, n: o == n)
  282. for i, node in enumerate(self.nodes):
  283. if recursive:
  284. for child in self._get_children(node):
  285. if equivalent(obj, child):
  286. return i
  287. elif equivalent(obj, node):
  288. return i
  289. raise ValueError(obj)
  290. def get_ancestors(self, obj):
  291. """Return a list of all ancestor nodes of the :class:`.Node` *obj*.
  292. The list is ordered from the most shallow ancestor (greatest great-
  293. grandparent) to the direct parent. The node itself is not included in
  294. the list. For example::
  295. >>> text = "{{a|{{b|{{c|{{d}}}}}}}}"
  296. >>> code = mwparserfromhell.parse(text)
  297. >>> node = code.filter_templates(matches=lambda n: n == "{{d}}")[0]
  298. >>> code.get_ancestors(node)
  299. ['{{a|{{b|{{c|{{d}}}}}}}}', '{{b|{{c|{{d}}}}}}', '{{c|{{d}}}}']
  300. Will return an empty list if *obj* is at the top level of this Wikicode
  301. object. Will raise :exc:`ValueError` if it wasn't found.
  302. """
  303. def _get_ancestors(code, needle):
  304. for node in code.nodes:
  305. if node is needle:
  306. return []
  307. for code in node.__children__():
  308. ancestors = _get_ancestors(code, needle)
  309. if ancestors is not None:
  310. return [node] + ancestors
  311. return None
  312. if isinstance(obj, Wikicode):
  313. obj = obj.get(0)
  314. elif not isinstance(obj, Node):
  315. raise ValueError(obj)
  316. ancestors = _get_ancestors(self, obj)
  317. if ancestors is None:
  318. raise ValueError(obj)
  319. return ancestors
  320. def get_parent(self, obj):
  321. """Return the direct parent node of the :class:`.Node` *obj*.
  322. This function is equivalent to calling :meth:`.get_ancestors` and
  323. taking the last element of the resulting list. Will return None if
  324. the node exists but does not have a parent; i.e., it is at the top
  325. level of the Wikicode object.
  326. """
  327. ancestors = self.get_ancestors(obj)
  328. return ancestors[-1] if ancestors else None
  329. def insert(self, index, value):
  330. """Insert *value* at *index* in the list of nodes.
  331. *value* can be anything parsable by :func:`.parse_anything`, which
  332. includes strings or other :class:`.Wikicode` or :class:`.Node` objects.
  333. """
  334. nodes = parse_anything(value).nodes
  335. for node in reversed(nodes):
  336. self.nodes.insert(index, node)
  337. def insert_before(self, obj, value, recursive=True):
  338. """Insert *value* immediately before *obj*.
  339. *obj* can be either a string, a :class:`.Node`, or another
  340. :class:`.Wikicode` object (as created by :meth:`get_sections`, for
  341. example). If *obj* is a string, we will operate on all instances of
  342. that string within the code, otherwise only on the specific instance
  343. given. *value* can be anything parsable by :func:`.parse_anything`. If
  344. *recursive* is ``True``, we will try to find *obj* within our child
  345. nodes even if it is not a direct descendant of this :class:`.Wikicode`
  346. object. If *obj* is not found, :exc:`ValueError` is raised.
  347. """
  348. if isinstance(obj, (Node, Wikicode)):
  349. context, index = self._do_strong_search(obj, recursive)
  350. context.insert(index.start, value)
  351. else:
  352. for exact, context, index in self._do_weak_search(obj, recursive):
  353. if exact:
  354. context.insert(index.start, value)
  355. else:
  356. obj = str(obj)
  357. self._slice_replace(context, index, obj, str(value) + obj)
  358. def insert_after(self, obj, value, recursive=True):
  359. """Insert *value* immediately after *obj*.
  360. *obj* can be either a string, a :class:`.Node`, or another
  361. :class:`.Wikicode` object (as created by :meth:`get_sections`, for
  362. example). If *obj* is a string, we will operate on all instances of
  363. that string within the code, otherwise only on the specific instance
  364. given. *value* can be anything parsable by :func:`.parse_anything`. If
  365. *recursive* is ``True``, we will try to find *obj* within our child
  366. nodes even if it is not a direct descendant of this :class:`.Wikicode`
  367. object. If *obj* is not found, :exc:`ValueError` is raised.
  368. """
  369. if isinstance(obj, (Node, Wikicode)):
  370. context, index = self._do_strong_search(obj, recursive)
  371. context.insert(index.stop, value)
  372. else:
  373. for exact, context, index in self._do_weak_search(obj, recursive):
  374. if exact:
  375. context.insert(index.stop, value)
  376. else:
  377. obj = str(obj)
  378. self._slice_replace(context, index, obj, obj + str(value))
  379. def replace(self, obj, value, recursive=True):
  380. """Replace *obj* with *value*.
  381. *obj* can be either a string, a :class:`.Node`, or another
  382. :class:`.Wikicode` object (as created by :meth:`get_sections`, for
  383. example). If *obj* is a string, we will operate on all instances of
  384. that string within the code, otherwise only on the specific instance
  385. given. *value* can be anything parsable by :func:`.parse_anything`.
  386. If *recursive* is ``True``, we will try to find *obj* within our child
  387. nodes even if it is not a direct descendant of this :class:`.Wikicode`
  388. object. If *obj* is not found, :exc:`ValueError` is raised.
  389. """
  390. if isinstance(obj, (Node, Wikicode)):
  391. context, index = self._do_strong_search(obj, recursive)
  392. for _ in range(index.start, index.stop):
  393. context.nodes.pop(index.start)
  394. context.insert(index.start, value)
  395. else:
  396. for exact, context, index in self._do_weak_search(obj, recursive):
  397. if exact:
  398. for _ in range(index.start, index.stop):
  399. context.nodes.pop(index.start)
  400. context.insert(index.start, value)
  401. else:
  402. self._slice_replace(context, index, str(obj), str(value))
  403. def append(self, value):
  404. """Insert *value* at the end of the list of nodes.
  405. *value* can be anything parsable by :func:`.parse_anything`.
  406. """
  407. nodes = parse_anything(value).nodes
  408. for node in nodes:
  409. self.nodes.append(node)
  410. def remove(self, obj, recursive=True):
  411. """Remove *obj* from the list of nodes.
  412. *obj* can be either a string, a :class:`.Node`, or another
  413. :class:`.Wikicode` object (as created by :meth:`get_sections`, for
  414. example). If *obj* is a string, we will operate on all instances of
  415. that string within the code, otherwise only on the specific instance
  416. given. If *recursive* is ``True``, we will try to find *obj* within our
  417. child nodes even if it is not a direct descendant of this
  418. :class:`.Wikicode` object. If *obj* is not found, :exc:`ValueError` is
  419. raised.
  420. """
  421. if isinstance(obj, (Node, Wikicode)):
  422. context, index = self._do_strong_search(obj, recursive)
  423. for _ in range(index.start, index.stop):
  424. context.nodes.pop(index.start)
  425. else:
  426. for exact, context, index in self._do_weak_search(obj, recursive):
  427. if exact:
  428. for _ in range(index.start, index.stop):
  429. context.nodes.pop(index.start)
  430. else:
  431. self._slice_replace(context, index, str(obj), "")
  432. def matches(self, other):
  433. """Do a loose equivalency test suitable for comparing page names.
  434. *other* can be any string-like object, including :class:`.Wikicode`, or
  435. an iterable of these. This operation is symmetric; both sides are
  436. adjusted. Specifically, whitespace and markup is stripped and the first
  437. letter's case is normalized. Typical usage is
  438. ``if template.name.matches("stub"): ...``.
  439. """
  440. normalize = lambda s: (s[0].upper() + s[1:]).replace("_", " ") if s else s
  441. this = normalize(self.strip_code().strip())
  442. if isinstance(other, (str, bytes, Wikicode, Node)):
  443. that = parse_anything(other).strip_code().strip()
  444. return this == normalize(that)
  445. for obj in other:
  446. that = parse_anything(obj).strip_code().strip()
  447. if this == normalize(that):
  448. return True
  449. return False
  450. def ifilter(self, recursive=True, matches=None, flags=FLAGS,
  451. forcetype=None):
  452. """Iterate over nodes in our list matching certain conditions.
  453. If *forcetype* is given, only nodes that are instances of this type (or
  454. tuple of types) are yielded. Setting *recursive* to ``True`` will
  455. iterate over all children and their descendants. ``RECURSE_OTHERS``
  456. will only iterate over children that are not the instances of
  457. *forcetype*. ``False`` will only iterate over immediate children.
  458. ``RECURSE_OTHERS`` can be used to iterate over all un-nested templates,
  459. even if they are inside of HTML tags, like so:
  460. >>> code = mwparserfromhell.parse("{{foo}}<b>{{foo|{{bar}}}}</b>")
  461. >>> code.filter_templates(code.RECURSE_OTHERS)
  462. ["{{foo}}", "{{foo|{{bar}}}}"]
  463. *matches* can be used to further restrict the nodes, either as a
  464. function (taking a single :class:`.Node` and returning a boolean) or a
  465. regular expression (matched against the node's string representation
  466. with :func:`re.search`). If *matches* is a regex, the flags passed to
  467. :func:`re.search` are :const:`re.IGNORECASE`, :const:`re.DOTALL`, and
  468. :const:`re.UNICODE`, but custom flags can be specified by passing
  469. *flags*.
  470. """
  471. gen = self._indexed_ifilter(recursive, matches, flags, forcetype)
  472. return (node for i, node in gen)
  473. def filter(self, *args, **kwargs):
  474. """Return a list of nodes within our list matching certain conditions.
  475. This is equivalent to calling :func:`list` on :meth:`ifilter`.
  476. """
  477. return list(self.ifilter(*args, **kwargs))
    def get_sections(self, levels=None, matches=None, flags=FLAGS, flat=False,
                     include_lead=None, include_headings=True):
        """Return a list of sections within the page.

        Sections are returned as :class:`.Wikicode` objects built from slices
        of our node list. The original documentation states that these share
        the node list (via :class:`.SmartList`) so that changes to sections
        are reflected in the parent Wikicode object.

        Each section contains all of its subsections, unless *flat* is
        ``True``. If *levels* is given, it should be an iterable of integers;
        only sections whose heading levels are within it will be returned. If
        *matches* is given, it should be either a function or a regex; only
        sections whose headings match it (without the surrounding equal signs)
        will be included. *flags* can be used to override the default regex
        flags (see :meth:`ifilter`) if a regex *matches* is used.

        If *include_lead* is ``True``, the first, lead section (without a
        heading) will be included in the list; ``False`` will not include it;
        the default (``None``) will include it only if neither *levels* nor
        *matches* was given. If *include_headings* is ``True``, the section's
        beginning :class:`.Heading` object will be included; otherwise, this
        is skipped.
        """
        title_matcher = self._build_matcher(matches, flags)
        # A heading qualifies when its title matches and, if *levels* was
        # given, its level is one of them.
        matcher = lambda heading: (title_matcher(heading.title) and
                                   (not levels or heading.level in levels))
        # Only top-level Heading nodes delimit sections.
        iheadings = self._indexed_ifilter(recursive=False, forcetype=Heading)
        sections = []  # Tuples of (index_of_first_node, section)
        open_headings = []  # Tuples of (index, heading), where index and
                            # heading.level are both monotonically increasing

        # Add the lead section if appropriate:
        if include_lead or not (include_lead is not None or matches or levels):
            # A separate iterator is used so that *iheadings* is not advanced.
            itr = self._indexed_ifilter(recursive=False, forcetype=Heading)
            try:
                first = next(itr)[0]
                sections.append((0, Wikicode(self.nodes[:first])))
            except StopIteration:  # No headings in page
                sections.append((0, Wikicode(self.nodes[:])))

        # Iterate over headings, adding sections to the list as they end:
        for i, heading in iheadings:
            if flat:  # With flat, all sections close at the next heading
                newly_closed, open_headings = open_headings, []
            else:  # Otherwise, figure out which sections have closed, if any
                closed_start_index = len(open_headings)
                for j, (start, last_heading) in enumerate(open_headings):
                    # The first open heading at the same or a deeper level
                    # closes here, along with everything opened after it.
                    if heading.level <= last_heading.level:
                        closed_start_index = j
                        break
                newly_closed = open_headings[closed_start_index:]
                del open_headings[closed_start_index:]
            for start, closed_heading in newly_closed:
                if matcher(closed_heading):
                    sections.append((start, Wikicode(self.nodes[start:i])))
            # The new section starts at its heading, or just after it when
            # headings are excluded.
            start = i if include_headings else (i + 1)
            open_headings.append((start, heading))

        # Add any remaining open headings to the list of sections:
        for start, heading in open_headings:
            if matcher(heading):
                sections.append((start, Wikicode(self.nodes[start:])))

        # Ensure that earlier sections are earlier in the returned list:
        return [section for i, section in sorted(sections)]
  535. def strip_code(self, normalize=True, collapse=True,
  536. keep_template_params=False):
  537. """Return a rendered string without unprintable code such as templates.
  538. The way a node is stripped is handled by the
  539. :meth:`~.Node.__strip__` method of :class:`.Node` objects, which
  540. generally return a subset of their nodes or ``None``. For example,
  541. templates and tags are removed completely, links are stripped to just
  542. their display part, headings are stripped to just their title.
  543. If *normalize* is ``True``, various things may be done to strip code
  544. further, such as converting HTML entities like ``&Sigma;``, ``&#931;``,
  545. and ``&#x3a3;`` to ``Σ``. If *collapse* is ``True``, we will try to
  546. remove excess whitespace as well (three or more newlines are converted
  547. to two, for example). If *keep_template_params* is ``True``, then
  548. template parameters will be preserved in the output (normally, they are
  549. removed completely).
  550. """
  551. kwargs = {
  552. "normalize": normalize,
  553. "collapse": collapse,
  554. "keep_template_params": keep_template_params
  555. }
  556. nodes = []
  557. for node in self.nodes:
  558. stripped = node.__strip__(**kwargs)
  559. if stripped:
  560. nodes.append(str(stripped))
  561. if collapse:
  562. stripped = "".join(nodes).strip("\n")
  563. while "\n\n\n" in stripped:
  564. stripped = stripped.replace("\n\n\n", "\n\n")
  565. return stripped
  566. return "".join(nodes)
  567. def get_tree(self):
  568. """Return a hierarchical tree representation of the object.
  569. The representation is a string makes the most sense printed. It is
  570. built by calling :meth:`_get_tree` on the :class:`.Wikicode` object and
  571. its children recursively. The end result may look something like the
  572. following::
  573. >>> text = "Lorem ipsum {{foo|bar|{{baz}}|spam=eggs}}"
  574. >>> print(mwparserfromhell.parse(text).get_tree())
  575. Lorem ipsum
  576. {{
  577. foo
  578. | 1
  579. = bar
  580. | 2
  581. = {{
  582. baz
  583. }}
  584. | spam
  585. = eggs
  586. }}
  587. """
  588. marker = object() # Random object we can find with certainty in a list
  589. return "\n".join(self._get_tree(self, [], marker, 0))
# Attach the ``ifilter_<name>`` / ``filter_<name>`` shortcut methods to
# ``Wikicode``, one pair for each node type listed below.
Wikicode._build_filter_methods(
    arguments=Argument, comments=Comment, external_links=ExternalLink,
    headings=Heading, html_entities=HTMLEntity, tags=Tag, templates=Template,
    text=Text, wikilinks=Wikilink)