A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
  22. from __future__ import unicode_literals
  23. import re
  24. from .compat import maxsize, py3k, str
  25. from .nodes import (Argument, Comment, Heading, HTMLEntity, Node, Tag,
  26. Template, Text, Wikilink)
  27. from .string_mixin import StringMixIn
  28. from .utils import parse_anything
  29. __all__ = ["Wikicode"]
  30. FLAGS = re.IGNORECASE | re.DOTALL | re.UNICODE
  31. class Wikicode(StringMixIn):
  32. """A ``Wikicode`` is a container for nodes that operates like a string.
  33. Additionally, it contains methods that can be used to extract data from or
  34. modify the nodes, implemented in an interface similar to a list. For
  35. example, :py:meth:`index` can get the index of a node in the list, and
  36. :py:meth:`insert` can add a new node at that index. The :py:meth:`filter()
  37. <ifilter>` series of functions is very useful for extracting and iterating
  38. over, for example, all of the templates in the object.
  39. """
  40. def __init__(self, nodes):
  41. super(Wikicode, self).__init__()
  42. self._nodes = nodes
  43. def __unicode__(self):
  44. return "".join([str(node) for node in self.nodes])
  45. def _get_children(self, node):
  46. """Iterate over all descendants of a given *node*, including itself.
  47. This is implemented by the ``__iternodes__()`` generator of ``Node``
  48. classes, which by default yields itself and nothing more.
  49. """
  50. for context, child in node.__iternodes__(self._get_all_nodes):
  51. yield child
  52. def _get_context(self, node, obj):
  53. """Return a ``Wikicode`` that contains *obj* in its descendants.
  54. The closest (shortest distance from *node*) suitable ``Wikicode`` will
  55. be returned, or ``None`` if the *obj* is the *node* itself.
  56. Raises ``ValueError`` if *obj* is not within *node*.
  57. """
  58. for context, child in node.__iternodes__(self._get_all_nodes):
  59. if self._is_equivalent(obj, child):
  60. return context
  61. raise ValueError(obj)
  62. def _get_all_nodes(self, code):
  63. """Iterate over all of our descendant nodes.
  64. This is implemented by calling :py:meth:`_get_children` on every node
  65. in our node list (:py:attr:`self.nodes <nodes>`).
  66. """
  67. for node in code.nodes:
  68. for child in self._get_children(node):
  69. yield child
  70. def _is_equivalent(self, obj, node):
  71. """Return ``True`` if *obj* and *node* are equivalent, else ``False``.
  72. If *obj* is a ``Node``, the function will test whether they are the
  73. same object, otherwise it will compare them with ``==``.
  74. """
  75. return (node is obj) if isinstance(obj, Node) else (node == obj)
  76. def _contains(self, nodes, obj):
  77. """Return ``True`` if *obj* is inside of *nodes*, else ``False``.
  78. If *obj* is a ``Node``, we will only return ``True`` if *obj* is
  79. actually in the list (and not just a node that equals it). Otherwise,
  80. the test is simply ``obj in nodes``.
  81. """
  82. if isinstance(obj, Node):
  83. for node in nodes:
  84. if node is obj:
  85. return True
  86. return False
  87. return obj in nodes
  88. def _do_search(self, obj, recursive, callback, context, *args, **kwargs):
  89. """Look within *context* for *obj*, executing *callback* if found.
  90. If *recursive* is ``True``, we'll look within context and its
  91. descendants, otherwise we'll just execute callback. We raise
  92. :py:exc:`ValueError` if *obj* isn't in our node list or context. If
  93. found, *callback* is passed the context, the index of the node within
  94. the context, and whatever were passed as ``*args`` and ``**kwargs``.
  95. """
  96. if recursive:
  97. for i, node in enumerate(context.nodes):
  98. if self._is_equivalent(obj, node):
  99. return callback(context, i, *args, **kwargs)
  100. if self._contains(self._get_children(node), obj):
  101. context = self._get_context(node, obj)
  102. return self._do_search(obj, recursive, callback, context,
  103. *args, **kwargs)
  104. raise ValueError(obj)
  105. callback(context, self.index(obj, recursive=False), *args, **kwargs)
  106. def _get_tree(self, code, lines, marker, indent):
  107. """Build a tree to illustrate the way the Wikicode object was parsed.
  108. The method that builds the actual tree is ``__showtree__`` of ``Node``
  109. objects. *code* is the ``Wikicode`` object to build a tree for. *lines*
  110. is the list to append the tree to, which is returned at the end of the
  111. method. *marker* is some object to be used to indicate that the builder
  112. should continue on from the last line instead of starting a new one; it
  113. should be any object that can be tested for with ``is``. *indent* is
  114. the starting indentation.
  115. """
  116. def write(*args):
  117. """Write a new line following the proper indentation rules."""
  118. if lines and lines[-1] is marker: # Continue from the last line
  119. lines.pop() # Remove the marker
  120. last = lines.pop()
  121. lines.append(last + " ".join(args))
  122. else:
  123. lines.append(" " * 6 * indent + " ".join(args))
  124. get = lambda code: self._get_tree(code, lines, marker, indent + 1)
  125. mark = lambda: lines.append(marker)
  126. for node in code.nodes:
  127. node.__showtree__(write, get, mark)
  128. return lines
  129. @classmethod
  130. def _build_filter_methods(cls, **meths):
  131. """Given Node types, build the corresponding i?filter shortcuts.
  132. The should be given as keys storing the method's base name paired
  133. with values storing the corresponding :py:class:`~.Node` type. For
  134. example, the dict may contain the pair ``("templates", Template)``,
  135. which will produce the methods :py:meth:`ifilter_templates` and
  136. :py:meth:`filter_templates`, which are shortcuts for
  137. :py:meth:`ifilter(forcetype=Template) <ifilter>` and
  138. :py:meth:`filter(forcetype=Template) <filter>`, respectively. These
  139. shortcuts are added to the class itself, with an appropriate docstring.
  140. """
  141. doc = """Iterate over {0}.
  142. This is equivalent to :py:meth:`{1}` with *forcetype* set to
  143. :py:class:`~{2.__module__}.{2.__name__}`.
  144. """
  145. make_ifilter = lambda ftype: (lambda self, **kw:
  146. self.ifilter(forcetype=ftype, **kw))
  147. make_filter = lambda ftype: (lambda self, **kw:
  148. self.filter(forcetype=ftype, **kw))
  149. for name, ftype in (meths.items() if py3k else meths.iteritems()):
  150. ifilter = make_ifilter(ftype)
  151. filter = make_filter(ftype)
  152. ifilter.__doc__ = doc.format(name, "ifilter", ftype)
  153. filter.__doc__ = doc.format(name, "filter", ftype)
  154. setattr(cls, "ifilter_" + name, ifilter)
  155. setattr(cls, "filter_" + name, filter)
  156. @property
  157. def nodes(self):
  158. """A list of :py:class:`~.Node` objects.
  159. This is the internal data actually stored within a
  160. :py:class:`~.Wikicode` object.
  161. """
  162. return self._nodes
  163. @nodes.setter
  164. def nodes(self, value):
  165. if not isinstance(value, list):
  166. value = parse_anything(value).nodes
  167. self._nodes = value
  168. def get(self, index):
  169. """Return the *index*\ th node within the list of nodes."""
  170. return self.nodes[index]
  171. def set(self, index, value):
  172. """Set the ``Node`` at *index* to *value*.
  173. Raises :py:exc:`IndexError` if *index* is out of range, or
  174. :py:exc:`ValueError` if *value* cannot be coerced into one
  175. :py:class:`~.Node`. To insert multiple nodes at an index, use
  176. :py:meth:`get` with either :py:meth:`remove` and :py:meth:`insert` or
  177. :py:meth:`replace`.
  178. """
  179. nodes = parse_anything(value).nodes
  180. if len(nodes) > 1:
  181. raise ValueError("Cannot coerce multiple nodes into one index")
  182. if index >= len(self.nodes) or -1 * index > len(self.nodes):
  183. raise IndexError("List assignment index out of range")
  184. if nodes:
  185. self.nodes[index] = nodes[0]
  186. else:
  187. self.nodes.pop(index)
  188. def index(self, obj, recursive=False):
  189. """Return the index of *obj* in the list of nodes.
  190. Raises :py:exc:`ValueError` if *obj* is not found. If *recursive* is
  191. ``True``, we will look in all nodes of ours and their descendants, and
  192. return the index of our direct descendant node within *our* list of
  193. nodes. Otherwise, the lookup is done only on direct descendants.
  194. """
  195. if recursive:
  196. for i, node in enumerate(self.nodes):
  197. if self._contains(self._get_children(node), obj):
  198. return i
  199. raise ValueError(obj)
  200. for i, node in enumerate(self.nodes):
  201. if self._is_equivalent(obj, node):
  202. return i
  203. raise ValueError(obj)
  204. def insert(self, index, value):
  205. """Insert *value* at *index* in the list of nodes.
  206. *value* can be anything parasable by :py:func:`.parse_anything`, which
  207. includes strings or other :py:class:`~.Wikicode` or :py:class:`~.Node`
  208. objects.
  209. """
  210. nodes = parse_anything(value).nodes
  211. for node in reversed(nodes):
  212. self.nodes.insert(index, node)
  213. def insert_before(self, obj, value, recursive=True):
  214. """Insert *value* immediately before *obj* in the list of nodes.
  215. *obj* can be either a string or a :py:class:`~.Node`. *value* can be
  216. anything parasable by :py:func:`.parse_anything`. If *recursive* is
  217. ``True``, we will try to find *obj* within our child nodes even if it
  218. is not a direct descendant of this :py:class:`~.Wikicode` object. If
  219. *obj* is not in the node list, :py:exc:`ValueError` is raised.
  220. """
  221. callback = lambda self, i, value: self.insert(i, value)
  222. self._do_search(obj, recursive, callback, self, value)
  223. def insert_after(self, obj, value, recursive=True):
  224. """Insert *value* immediately after *obj* in the list of nodes.
  225. *obj* can be either a string or a :py:class:`~.Node`. *value* can be
  226. anything parasable by :py:func:`.parse_anything`. If *recursive* is
  227. ``True``, we will try to find *obj* within our child nodes even if it
  228. is not a direct descendant of this :py:class:`~.Wikicode` object. If
  229. *obj* is not in the node list, :py:exc:`ValueError` is raised.
  230. """
  231. callback = lambda self, i, value: self.insert(i + 1, value)
  232. self._do_search(obj, recursive, callback, self, value)
  233. def replace(self, obj, value, recursive=True):
  234. """Replace *obj* with *value* in the list of nodes.
  235. *obj* can be either a string or a :py:class:`~.Node`. *value* can be
  236. anything parasable by :py:func:`.parse_anything`. If *recursive* is
  237. ``True``, we will try to find *obj* within our child nodes even if it
  238. is not a direct descendant of this :py:class:`~.Wikicode` object. If
  239. *obj* is not in the node list, :py:exc:`ValueError` is raised.
  240. """
  241. def callback(self, i, value):
  242. self.nodes.pop(i)
  243. self.insert(i, value)
  244. self._do_search(obj, recursive, callback, self, value)
  245. def append(self, value):
  246. """Insert *value* at the end of the list of nodes.
  247. *value* can be anything parasable by :py:func:`.parse_anything`.
  248. """
  249. nodes = parse_anything(value).nodes
  250. for node in nodes:
  251. self.nodes.append(node)
  252. def remove(self, obj, recursive=True):
  253. """Remove *obj* from the list of nodes.
  254. *obj* can be either a string or a :py:class:`~.Node`. If *recursive* is
  255. ``True``, we will try to find *obj* within our child nodes even if it
  256. is not a direct descendant of this :py:class:`~.Wikicode` object. If
  257. *obj* is not in the node list, :py:exc:`ValueError` is raised.
  258. """
  259. callback = lambda self, i: self.nodes.pop(i)
  260. self._do_search(obj, recursive, callback, self)
  261. def ifilter(self, recursive=True, matches=None, flags=FLAGS,
  262. forcetype=None):
  263. """Iterate over nodes in our list matching certain conditions.
  264. If *recursive* is ``True``, we will iterate over our children and all
  265. descendants of our children, otherwise just our immediate children. If
  266. *matches* is given, we will only yield the nodes that match the given
  267. regular expression (with :py:func:`re.search`). The default flags used
  268. are :py:const:`re.IGNORECASE`, :py:const:`re.DOTALL`, and
  269. :py:const:`re.UNICODE`, but custom flags can be specified by passing
  270. *flags*. If *forcetype* is given, only nodes that are instances of this
  271. type are yielded.
  272. """
  273. for node in (self._get_all_nodes(self) if recursive else self.nodes):
  274. if not forcetype or isinstance(node, forcetype):
  275. if not matches or re.search(matches, str(node), flags):
  276. yield node
  277. def filter(self, recursive=True, matches=None, flags=FLAGS,
  278. forcetype=None):
  279. """Return a list of nodes within our list matching certain conditions.
  280. This is equivalent to calling :py:func:`list` on :py:meth:`ifilter`.
  281. """
  282. return list(self.ifilter(recursive, matches, flags, forcetype))
  283. def get_sections(self, levels=None, matches=None, flags=FLAGS,
  284. include_lead=None, include_headings=True):
  285. """Return a list of sections within the page.
  286. Sections are returned as :py:class:`~.Wikicode` objects with a shared
  287. node list (implemented using :py:class:`~.SmartList`) so that changes
  288. to sections are reflected in the parent Wikicode object.
  289. Each section contains all of its subsections. If *levels* is given, it
  290. should be a iterable of integers; only sections whose heading levels
  291. are within it will be returned.If *matches* is given, it should be a
  292. regex to be matched against the titles of section headings; only
  293. sections whose headings match the regex will be included. *flags* can
  294. be used to override the default regex flags (see :py:meth:`ifilter`) if
  295. *matches* is used.
  296. If *include_lead* is ``True``, the first, lead section (without a
  297. heading) will be included in the list; ``False`` will not include it;
  298. the default will include it only if no specific *levels* were given. If
  299. *include_headings* is ``True``, the section's beginning
  300. :py:class:`~.Heading` object will be included; otherwise, this is
  301. skipped.
  302. """
  303. if matches:
  304. matches = r"^(=+?)\s*" + matches + r"\s*\1$"
  305. headings = self.filter_headings(recursive=True)
  306. filtered = self.filter_headings(recursive=True, matches=matches,
  307. flags=flags)
  308. if levels:
  309. filtered = [head for head in filtered if head.level in levels]
  310. if matches or include_lead is False or (not include_lead and levels):
  311. buffers = []
  312. else:
  313. buffers = [(maxsize, 0)]
  314. sections = []
  315. i = 0
  316. while i < len(self.nodes):
  317. if self.nodes[i] in headings:
  318. this = self.nodes[i].level
  319. for (level, start) in buffers:
  320. if this <= level:
  321. sections.append(Wikicode(self.nodes[start:i]))
  322. buffers = [buf for buf in buffers if buf[0] < this]
  323. if self.nodes[i] in filtered:
  324. if not include_headings:
  325. i += 1
  326. if i >= len(self.nodes):
  327. break
  328. buffers.append((this, i))
  329. i += 1
  330. for (level, start) in buffers:
  331. if start != i:
  332. sections.append(Wikicode(self.nodes[start:i]))
  333. return sections
  334. def strip_code(self, normalize=True, collapse=True):
  335. """Return a rendered string without unprintable code such as templates.
  336. The way a node is stripped is handled by the
  337. :py:meth:`~.Node.__showtree__` method of :py:class:`~.Node` objects,
  338. which generally return a subset of their nodes or ``None``. For
  339. example, templates and tags are removed completely, links are stripped
  340. to just their display part, headings are stripped to just their title.
  341. If *normalize* is ``True``, various things may be done to strip code
  342. further, such as converting HTML entities like ``&Sigma;``, ``&#931;``,
  343. and ``&#x3a3;`` to ``Σ``. If *collapse* is ``True``, we will try to
  344. remove excess whitespace as well (three or more newlines are converted
  345. to two, for example).
  346. """
  347. nodes = []
  348. for node in self.nodes:
  349. stripped = node.__strip__(normalize, collapse)
  350. if stripped:
  351. nodes.append(str(stripped))
  352. if collapse:
  353. stripped = "".join(nodes).strip("\n")
  354. while "\n\n\n" in stripped:
  355. stripped = stripped.replace("\n\n\n", "\n\n")
  356. return stripped
  357. else:
  358. return "".join(nodes)
  359. def get_tree(self):
  360. """Return a hierarchical tree representation of the object.
  361. The representation is a string makes the most sense printed. It is
  362. built by calling :py:meth:`_get_tree` on the
  363. :py:class:`~.Wikicode` object and its children recursively. The end
  364. result may look something like the following::
  365. >>> text = "Lorem ipsum {{foo|bar|{{baz}}|spam=eggs}}"
  366. >>> print mwparserfromhell.parse(text).get_tree()
  367. Lorem ipsum
  368. {{
  369. foo
  370. | 1
  371. = bar
  372. | 2
  373. = {{
  374. baz
  375. }}
  376. | spam
  377. = eggs
  378. }}
  379. """
  380. marker = object() # Random object we can find with certainty in a list
  381. return "\n".join(self._get_tree(self, [], marker, 0))
  382. Wikicode._build_filter_methods(
  383. arguments=Argument, comments=Comment, headings=Heading,
  384. html_entities=HTMLEntity, tags=Tag, templates=Template, text=Text,
  385. wikilinks=Wikilink)