A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

135 lines
3.8 KiB

  1. # Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in
  11. # all copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  19. # SOFTWARE.
  20. """
  21. Contains data about certain markup, like HTML tags and external links.
  22. When updating this file, please also update the C tokenizer version:
  23. - mwparserfromhell/parser/ctokenizer/definitions.c
  24. - mwparserfromhell/parser/ctokenizer/definitions.h
  25. """
  26. __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
  27. "is_single_only", "is_scheme"]
  28. URI_SCHEMES = {
  29. # [wikimedia/mediawiki.git]/includes/DefaultSettings.php @ 5c660de5d0
  30. "bitcoin": False,
  31. "ftp": True,
  32. "ftps": True,
  33. "geo": False,
  34. "git": True,
  35. "gopher": True,
  36. "http": True,
  37. "https": True,
  38. "irc": True,
  39. "ircs": True,
  40. "magnet": False,
  41. "mailto": False,
  42. "mms": True,
  43. "news": False,
  44. "nntp": True,
  45. "redis": True,
  46. "sftp": True,
  47. "sip": False,
  48. "sips": False,
  49. "sms": False,
  50. "ssh": True,
  51. "svn": True,
  52. "tel": False,
  53. "telnet": True,
  54. "urn": False,
  55. "worldwind": True,
  56. "xmpp": False,
  57. }
  58. PARSER_BLACKLIST = [
  59. # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
  60. "categorytree",
  61. "ce",
  62. "chem",
  63. "gallery",
  64. "graph",
  65. "hiero",
  66. "imagemap",
  67. "inputbox",
  68. "math",
  69. "nowiki",
  70. "pre",
  71. "score",
  72. "section",
  73. "source",
  74. "syntaxhighlight",
  75. "templatedata",
  76. "timeline",
  77. ]
  78. INVISIBLE_TAGS = [
  79. # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
  80. "categorytree",
  81. "gallery",
  82. "graph",
  83. "imagemap",
  84. "inputbox",
  85. "math",
  86. "score",
  87. "section",
  88. "templatedata",
  89. "timeline"
  90. ]
  91. # [wikimedia/mediawiki.git]/includes/parser/Sanitizer.php @ 95e17ee645
  92. SINGLE_ONLY = ["br", "wbr", "hr", "meta", "link", "img"]
  93. SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"]
  94. MARKUP_TO_HTML = {
  95. "#": "li",
  96. "*": "li",
  97. ";": "dt",
  98. ":": "dd"
  99. }
  100. def get_html_tag(markup):
  101. """Return the HTML tag associated with the given wiki-markup."""
  102. return MARKUP_TO_HTML[markup]
  103. def is_parsable(tag):
  104. """Return if the given *tag*'s contents should be passed to the parser."""
  105. return tag.lower() not in PARSER_BLACKLIST
  106. def is_visible(tag):
  107. """Return whether or not the given *tag* contains visible text."""
  108. return tag.lower() not in INVISIBLE_TAGS
  109. def is_single(tag):
  110. """Return whether or not the given *tag* can exist without a close tag."""
  111. return tag.lower() in SINGLE
  112. def is_single_only(tag):
  113. """Return whether or not the given *tag* must exist without a close tag."""
  114. return tag.lower() in SINGLE_ONLY
  115. def is_scheme(scheme, slashes=True):
  116. """Return whether *scheme* is valid for external links."""
  117. scheme = scheme.lower()
  118. if slashes:
  119. return scheme in URI_SCHEMES
  120. return scheme in URI_SCHEMES and not URI_SCHEMES[scheme]