A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

пре 11 година
пре 12 година
пре 11 година
пре 11 година
пре 11 година
пре 10 година
пре 10 година
пре 12 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 10 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 12 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 12 година
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. import re
  24. from .compat import maxsize, py3k, str
  25. from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity,
  26. Node, Tag, Template, Text, Wikilink)
  27. from .string_mixin import StringMixIn
  28. from .utils import parse_anything
# Names exported by ``from <this module> import *``.
__all__ = ["Wikicode"]

# Default regex flags used by Wikicode.ifilter(), Wikicode.filter() and
# Wikicode.get_sections() when the caller does not pass *flags*:
# case-insensitive, "." matches newlines, Unicode-aware character classes.
FLAGS = re.IGNORECASE | re.DOTALL | re.UNICODE
class Wikicode(StringMixIn):
    """A ``Wikicode`` is a container for nodes that operates like a string.

    Additionally, it contains methods that can be used to extract data from or
    modify the nodes, implemented in an interface similar to a list. For
    example, :py:meth:`index` can get the index of a node in the list, and
    :py:meth:`insert` can add a new node at that index. The :py:meth:`filter()
    <ifilter>` series of functions is very useful for extracting and iterating
    over, for example, all of the templates in the object.
    """

    def __init__(self, nodes):
        """Create a ``Wikicode`` wrapping *nodes*.

        *nodes* is stored as given (it is not copied), so later changes made
        through this object are made to that same list.
        """
        super(Wikicode, self).__init__()
        self._nodes = nodes
  43. def __unicode__(self):
  44. return "".join([str(node) for node in self.nodes])
  45. def _get_children(self, node):
  46. """Iterate over all descendants of a given *node*, including itself.
  47. This is implemented by the ``__iternodes__()`` generator of ``Node``
  48. classes, which by default yields itself and nothing more.
  49. """
  50. for context, child in node.__iternodes__(self._get_all_nodes):
  51. yield child
  52. def _get_all_nodes(self, code):
  53. """Iterate over all of our descendant nodes.
  54. This is implemented by calling :py:meth:`_get_children` on every node
  55. in our node list (:py:attr:`self.nodes <nodes>`).
  56. """
  57. for node in code.nodes:
  58. for child in self._get_children(node):
  59. yield child
  60. def _is_equivalent(self, obj, node):
  61. """Return ``True`` if *obj* and *node* are equivalent, else ``False``.
  62. If *obj* is a ``Node``, the function will test whether they are the
  63. same object, otherwise it will compare them with ``==``.
  64. """
  65. return (node is obj) if isinstance(obj, Node) else (node == obj)
  66. def _contains(self, nodes, obj):
  67. """Return ``True`` if *obj* is inside of *nodes*, else ``False``.
  68. If *obj* is a ``Node``, we will only return ``True`` if *obj* is
  69. actually in the list (and not just a node that equals it). Otherwise,
  70. the test is simply ``obj in nodes``.
  71. """
  72. if isinstance(obj, Node):
  73. for node in nodes:
  74. if node is obj:
  75. return True
  76. return False
  77. return obj in nodes
  78. def _do_search(self, obj, recursive, context=None, literal=None):
  79. """Return some info about the location of *obj* within *context*.
  80. If *recursive* is ``True``, we'll look within *context* (``self`` by
  81. default) and its descendants, otherwise just *context*. We raise
  82. :py:exc:`ValueError` if *obj* isn't found. The return data is a list of
  83. 3-tuples (*type*, *context*, *data*) where *type* is *obj*\ 's best
  84. type resolution (either ``Node``, ``Wikicode``, or ``str``), *context*
  85. is the closest ``Wikicode`` encompassing it, and *data* is either a
  86. ``Node``, a list of ``Node``\ s, or ``None`` depending on *type*.
  87. """
  88. if not context:
  89. context = self
  90. literal = isinstance(obj, (Node, Wikicode))
  91. obj = parse_anything(obj)
  92. if not obj or obj not in self:
  93. raise ValueError(obj)
  94. if len(obj.nodes) == 1:
  95. obj = obj.get(0)
  96. compare = lambda a, b: (a is b) if literal else (a == b)
  97. results = []
  98. i = 0
  99. while i < len(context.nodes):
  100. node = context.get(i)
  101. if isinstance(obj, Node) and compare(obj, node):
  102. results.append((Node, context, node))
  103. elif isinstance(obj, Wikicode) and compare(obj.get(0), node):
  104. for j in range(1, len(obj.nodes)):
  105. if not compare(obj.get(j), context.get(i + j)):
  106. break
  107. else:
  108. nodes = list(context.nodes[i:i + len(obj.nodes)])
  109. results.append((Wikicode, context, nodes))
  110. i += len(obj.nodes) - 1
  111. elif recursive:
  112. contexts = node.__iternodes__(self._get_all_nodes)
  113. processed = []
  114. for code in (ctx for ctx, child in contexts):
  115. if code and code not in processed and obj in code:
  116. search = self._do_search(obj, recursive, code, literal)
  117. results.extend(search)
  118. processed.append(code)
  119. i += 1
  120. if not results and not literal and recursive:
  121. results.append((str, context, None))
  122. if not results and context is self:
  123. raise ValueError(obj)
  124. return results
  125. def _get_tree(self, code, lines, marker, indent):
  126. """Build a tree to illustrate the way the Wikicode object was parsed.
  127. The method that builds the actual tree is ``__showtree__`` of ``Node``
  128. objects. *code* is the ``Wikicode`` object to build a tree for. *lines*
  129. is the list to append the tree to, which is returned at the end of the
  130. method. *marker* is some object to be used to indicate that the builder
  131. should continue on from the last line instead of starting a new one; it
  132. should be any object that can be tested for with ``is``. *indent* is
  133. the starting indentation.
  134. """
  135. def write(*args):
  136. """Write a new line following the proper indentation rules."""
  137. if lines and lines[-1] is marker: # Continue from the last line
  138. lines.pop() # Remove the marker
  139. last = lines.pop()
  140. lines.append(last + " ".join(args))
  141. else:
  142. lines.append(" " * 6 * indent + " ".join(args))
  143. get = lambda code: self._get_tree(code, lines, marker, indent + 1)
  144. mark = lambda: lines.append(marker)
  145. for node in code.nodes:
  146. node.__showtree__(write, get, mark)
  147. return lines
  148. @classmethod
  149. def _build_filter_methods(cls, **meths):
  150. """Given Node types, build the corresponding i?filter shortcuts.
  151. The should be given as keys storing the method's base name paired
  152. with values storing the corresponding :py:class:`~.Node` type. For
  153. example, the dict may contain the pair ``("templates", Template)``,
  154. which will produce the methods :py:meth:`ifilter_templates` and
  155. :py:meth:`filter_templates`, which are shortcuts for
  156. :py:meth:`ifilter(forcetype=Template) <ifilter>` and
  157. :py:meth:`filter(forcetype=Template) <filter>`, respectively. These
  158. shortcuts are added to the class itself, with an appropriate docstring.
  159. """
  160. doc = """Iterate over {0}.
  161. This is equivalent to :py:meth:`{1}` with *forcetype* set to
  162. :py:class:`~{2.__module__}.{2.__name__}`.
  163. """
  164. make_ifilter = lambda ftype: (lambda self, **kw:
  165. self.ifilter(forcetype=ftype, **kw))
  166. make_filter = lambda ftype: (lambda self, **kw:
  167. self.filter(forcetype=ftype, **kw))
  168. for name, ftype in (meths.items() if py3k else meths.iteritems()):
  169. ifilter = make_ifilter(ftype)
  170. filter = make_filter(ftype)
  171. ifilter.__doc__ = doc.format(name, "ifilter", ftype)
  172. filter.__doc__ = doc.format(name, "filter", ftype)
  173. setattr(cls, "ifilter_" + name, ifilter)
  174. setattr(cls, "filter_" + name, filter)
  175. @property
  176. def nodes(self):
  177. """A list of :py:class:`~.Node` objects.
  178. This is the internal data actually stored within a
  179. :py:class:`~.Wikicode` object.
  180. """
  181. return self._nodes
  182. @nodes.setter
  183. def nodes(self, value):
  184. if not isinstance(value, list):
  185. value = parse_anything(value).nodes
  186. self._nodes = value
  187. def get(self, index):
  188. """Return the *index*\ th node within the list of nodes."""
  189. return self.nodes[index]
  190. def set(self, index, value):
  191. """Set the ``Node`` at *index* to *value*.
  192. Raises :py:exc:`IndexError` if *index* is out of range, or
  193. :py:exc:`ValueError` if *value* cannot be coerced into one
  194. :py:class:`~.Node`. To insert multiple nodes at an index, use
  195. :py:meth:`get` with either :py:meth:`remove` and :py:meth:`insert` or
  196. :py:meth:`replace`.
  197. """
  198. nodes = parse_anything(value).nodes
  199. if len(nodes) > 1:
  200. raise ValueError("Cannot coerce multiple nodes into one index")
  201. if index >= len(self.nodes) or -1 * index > len(self.nodes):
  202. raise IndexError("List assignment index out of range")
  203. if nodes:
  204. self.nodes[index] = nodes[0]
  205. else:
  206. self.nodes.pop(index)
  207. def index(self, obj, recursive=False):
  208. """Return the index of *obj* in the list of nodes.
  209. Raises :py:exc:`ValueError` if *obj* is not found. If *recursive* is
  210. ``True``, we will look in all nodes of ours and their descendants, and
  211. return the index of our direct descendant node within *our* list of
  212. nodes. Otherwise, the lookup is done only on direct descendants.
  213. """
  214. if recursive:
  215. for i, node in enumerate(self.nodes):
  216. if self._contains(self._get_children(node), obj):
  217. return i
  218. raise ValueError(obj)
  219. for i, node in enumerate(self.nodes):
  220. if self._is_equivalent(obj, node):
  221. return i
  222. raise ValueError(obj)
  223. def insert(self, index, value):
  224. """Insert *value* at *index* in the list of nodes.
  225. *value* can be anything parasable by :py:func:`.parse_anything`, which
  226. includes strings or other :py:class:`~.Wikicode` or :py:class:`~.Node`
  227. objects.
  228. """
  229. nodes = parse_anything(value).nodes
  230. for node in reversed(nodes):
  231. self.nodes.insert(index, node)
  232. def insert_before(self, obj, value, recursive=True):
  233. """Insert *value* immediately before *obj* in the list of nodes.
  234. *obj* can be either a string, a :py:class:`~.Node`, or other
  235. :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`,
  236. for example). *value* can be anything parasable by
  237. :py:func:`.parse_anything`. If *recursive* is ``True``, we will try to
  238. find *obj* within our child nodes even if it is not a direct descendant
  239. of this :py:class:`~.Wikicode` object. If *obj* is not found,
  240. :py:exc:`ValueError` is raised.
  241. """
  242. for restype, context, data in self._do_search(obj, recursive):
  243. if restype in (Node, Wikicode):
  244. i = context.index(data if restype is Node else data[0], False)
  245. context.insert(i, value)
  246. else:
  247. obj = str(obj)
  248. context.nodes = str(context).replace(obj, str(value) + obj)
  249. def insert_after(self, obj, value, recursive=True):
  250. """Insert *value* immediately after *obj* in the list of nodes.
  251. *obj* can be either a string, a :py:class:`~.Node`, or other
  252. :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`,
  253. for example). *value* can be anything parasable by
  254. :py:func:`.parse_anything`. If *recursive* is ``True``, we will try to
  255. find *obj* within our child nodes even if it is not a direct descendant
  256. of this :py:class:`~.Wikicode` object. If *obj* is not found,
  257. :py:exc:`ValueError` is raised.
  258. """
  259. for restype, context, data in self._do_search(obj, recursive):
  260. if restype in (Node, Wikicode):
  261. i = context.index(data if restype is Node else data[-1], False)
  262. context.insert(i + 1, value)
  263. else:
  264. obj = str(obj)
  265. context.nodes = str(context).replace(obj, obj + str(value))
  266. def replace(self, obj, value, recursive=True):
  267. """Replace *obj* with *value* in the list of nodes.
  268. *obj* can be either a string, a :py:class:`~.Node`, or other
  269. :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`,
  270. for example). *value* can be anything parasable by
  271. :py:func:`.parse_anything`. If *recursive* is ``True``, we will try to
  272. find *obj* within our child nodes even if it is not a direct descendant
  273. of this :py:class:`~.Wikicode` object. If *obj* is not found,
  274. :py:exc:`ValueError` is raised.
  275. """
  276. for restype, context, data in self._do_search(obj, recursive):
  277. if restype is Node:
  278. i = context.index(data, False)
  279. context.nodes.pop(i)
  280. context.insert(i, value)
  281. elif restype is Wikicode:
  282. i = context.index(data[0], False)
  283. for _ in data:
  284. context.nodes.pop(i)
  285. context.insert(i, value)
  286. else:
  287. context.nodes = str(context).replace(str(obj), str(value))
  288. def append(self, value):
  289. """Insert *value* at the end of the list of nodes.
  290. *value* can be anything parasable by :py:func:`.parse_anything`.
  291. """
  292. nodes = parse_anything(value).nodes
  293. for node in nodes:
  294. self.nodes.append(node)
  295. def remove(self, obj, recursive=True):
  296. """Remove *obj* from the list of nodes.
  297. *obj* can be either a string, a :py:class:`~.Node`, or other
  298. :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`,
  299. for example). If *recursive* is ``True``, we will try to find *obj*
  300. within our child nodes even if it is not a direct descendant of this
  301. :py:class:`~.Wikicode` object. If *obj* is not found,
  302. :py:exc:`ValueError` is raised.
  303. """
  304. for restype, context, data in self._do_search(obj, recursive):
  305. if restype is Node:
  306. context.nodes.pop(context.index(data, False))
  307. elif restype is Wikicode:
  308. i = context.index(data[0], False)
  309. for _ in data:
  310. context.nodes.pop(i)
  311. else:
  312. context.nodes = str(context).replace(str(obj), "")
  313. def matches(self, other):
  314. """Do a loose equivalency test suitable for comparing page names.
  315. *other* can be any string-like object, including
  316. :py:class:`~.Wikicode`. This operation is symmetric; both sides are
  317. adjusted. Specifically, whitespace and markup is stripped and the first
  318. letter's case is normalized. Typical usage is
  319. ``if template.name.matches("stub"): ...``.
  320. """
  321. this = self.strip_code().strip()
  322. that = parse_anything(other).strip_code().strip()
  323. if not this or not that:
  324. return this == that
  325. return this[0].upper() + this[1:] == that[0].upper() + that[1:]
  326. def ifilter(self, recursive=True, matches=None, flags=FLAGS,
  327. forcetype=None):
  328. """Iterate over nodes in our list matching certain conditions.
  329. If *recursive* is ``True``, we will iterate over our children and all
  330. descendants of our children, otherwise just our immediate children. If
  331. *matches* is given, we will only yield the nodes that match the given
  332. regular expression (with :py:func:`re.search`). The default flags used
  333. are :py:const:`re.IGNORECASE`, :py:const:`re.DOTALL`, and
  334. :py:const:`re.UNICODE`, but custom flags can be specified by passing
  335. *flags*. If *forcetype* is given, only nodes that are instances of this
  336. type are yielded.
  337. """
  338. for node in (self._get_all_nodes(self) if recursive else self.nodes):
  339. if not forcetype or isinstance(node, forcetype):
  340. if not matches or re.search(matches, str(node), flags):
  341. yield node
  342. def filter(self, recursive=True, matches=None, flags=FLAGS,
  343. forcetype=None):
  344. """Return a list of nodes within our list matching certain conditions.
  345. This is equivalent to calling :py:func:`list` on :py:meth:`ifilter`.
  346. """
  347. return list(self.ifilter(recursive, matches, flags, forcetype))
  348. def get_sections(self, levels=None, matches=None, flags=FLAGS,
  349. include_lead=None, include_headings=True):
  350. """Return a list of sections within the page.
  351. Sections are returned as :py:class:`~.Wikicode` objects with a shared
  352. node list (implemented using :py:class:`~.SmartList`) so that changes
  353. to sections are reflected in the parent Wikicode object.
  354. Each section contains all of its subsections. If *levels* is given, it
  355. should be a iterable of integers; only sections whose heading levels
  356. are within it will be returned. If *matches* is given, it should be a
  357. regex to be matched against the titles of section headings; only
  358. sections whose headings match the regex will be included. *flags* can
  359. be used to override the default regex flags (see :py:meth:`ifilter`) if
  360. *matches* is used.
  361. If *include_lead* is ``True``, the first, lead section (without a
  362. heading) will be included in the list; ``False`` will not include it;
  363. the default will include it only if no specific *levels* were given. If
  364. *include_headings* is ``True``, the section's beginning
  365. :py:class:`~.Heading` object will be included; otherwise, this is
  366. skipped.
  367. """
  368. if matches:
  369. matches = r"^(=+?)\s*" + matches + r"\s*\1$"
  370. headings = self.filter_headings()
  371. filtered = self.filter_headings(matches=matches, flags=flags)
  372. if levels:
  373. filtered = [head for head in filtered if head.level in levels]
  374. if matches or include_lead is False or (not include_lead and levels):
  375. buffers = []
  376. else:
  377. buffers = [(maxsize, 0)]
  378. sections = []
  379. i = 0
  380. while i < len(self.nodes):
  381. if self.nodes[i] in headings:
  382. this = self.nodes[i].level
  383. for (level, start) in buffers:
  384. if this <= level:
  385. sections.append(Wikicode(self.nodes[start:i]))
  386. buffers = [buf for buf in buffers if buf[0] < this]
  387. if self.nodes[i] in filtered:
  388. if not include_headings:
  389. i += 1
  390. if i >= len(self.nodes):
  391. break
  392. buffers.append((this, i))
  393. i += 1
  394. for (level, start) in buffers:
  395. if start != i:
  396. sections.append(Wikicode(self.nodes[start:i]))
  397. return sections
  398. def strip_code(self, normalize=True, collapse=True):
  399. """Return a rendered string without unprintable code such as templates.
  400. The way a node is stripped is handled by the
  401. :py:meth:`~.Node.__showtree__` method of :py:class:`~.Node` objects,
  402. which generally return a subset of their nodes or ``None``. For
  403. example, templates and tags are removed completely, links are stripped
  404. to just their display part, headings are stripped to just their title.
  405. If *normalize* is ``True``, various things may be done to strip code
  406. further, such as converting HTML entities like ``&Sigma;``, ``&#931;``,
  407. and ``&#x3a3;`` to ``Σ``. If *collapse* is ``True``, we will try to
  408. remove excess whitespace as well (three or more newlines are converted
  409. to two, for example).
  410. """
  411. nodes = []
  412. for node in self.nodes:
  413. stripped = node.__strip__(normalize, collapse)
  414. if stripped:
  415. nodes.append(str(stripped))
  416. if collapse:
  417. stripped = "".join(nodes).strip("\n")
  418. while "\n\n\n" in stripped:
  419. stripped = stripped.replace("\n\n\n", "\n\n")
  420. return stripped
  421. else:
  422. return "".join(nodes)
  423. def get_tree(self):
  424. """Return a hierarchical tree representation of the object.
  425. The representation is a string makes the most sense printed. It is
  426. built by calling :py:meth:`_get_tree` on the
  427. :py:class:`~.Wikicode` object and its children recursively. The end
  428. result may look something like the following::
  429. >>> text = "Lorem ipsum {{foo|bar|{{baz}}|spam=eggs}}"
  430. >>> print mwparserfromhell.parse(text).get_tree()
  431. Lorem ipsum
  432. {{
  433. foo
  434. | 1
  435. = bar
  436. | 2
  437. = {{
  438. baz
  439. }}
  440. | spam
  441. = eggs
  442. }}
  443. """
  444. marker = object() # Random object we can find with certainty in a list
  445. return "\n".join(self._get_tree(self, [], marker, 0))
# Attach the ifilter_<name>() / filter_<name>() shortcut methods to Wikicode,
# one pair per keyword below (e.g. ifilter_templates() / filter_templates()).
# get_sections() relies on the ``headings`` pair.
Wikicode._build_filter_methods(
    arguments=Argument, comments=Comment, external_links=ExternalLink,
    headings=Heading, html_entities=HTMLEntity, tags=Tag, templates=Template,
    text=Text, wikilinks=Wikilink)