A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

147 lines
3.8 KiB

  1. # Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in
  11. # all copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  19. # SOFTWARE.
  20. """
  21. Contains data about certain markup, like HTML tags and external links.
  22. When updating this file, please also update the C tokenizer version:
  23. - mwparserfromhell/parser/ctokenizer/definitions.c
  24. - mwparserfromhell/parser/ctokenizer/definitions.h
  25. """
  26. __all__ = [
  27. "get_html_tag",
  28. "is_parsable",
  29. "is_visible",
  30. "is_single",
  31. "is_single_only",
  32. "is_scheme",
  33. ]
  34. URI_SCHEMES = {
  35. # [wikimedia/mediawiki.git]/includes/DefaultSettings.php @ 5c660de5d0
  36. "bitcoin": False,
  37. "ftp": True,
  38. "ftps": True,
  39. "geo": False,
  40. "git": True,
  41. "gopher": True,
  42. "http": True,
  43. "https": True,
  44. "irc": True,
  45. "ircs": True,
  46. "magnet": False,
  47. "mailto": False,
  48. "mms": True,
  49. "news": False,
  50. "nntp": True,
  51. "redis": True,
  52. "sftp": True,
  53. "sip": False,
  54. "sips": False,
  55. "sms": False,
  56. "ssh": True,
  57. "svn": True,
  58. "tel": False,
  59. "telnet": True,
  60. "urn": False,
  61. "worldwind": True,
  62. "xmpp": False,
  63. }
  64. PARSER_BLACKLIST = [
  65. # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
  66. "categorytree",
  67. "ce",
  68. "chem",
  69. "gallery",
  70. "graph",
  71. "hiero",
  72. "imagemap",
  73. "inputbox",
  74. "math",
  75. "nowiki",
  76. "pre",
  77. "score",
  78. "section",
  79. "source",
  80. "syntaxhighlight",
  81. "templatedata",
  82. "timeline",
  83. ]
  84. INVISIBLE_TAGS = [
  85. # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
  86. "categorytree",
  87. "gallery",
  88. "graph",
  89. "imagemap",
  90. "inputbox",
  91. "math",
  92. "score",
  93. "section",
  94. "templatedata",
  95. "timeline",
  96. ]
  97. # [wikimedia/mediawiki.git]/includes/parser/Sanitizer.php @ 95e17ee645
  98. SINGLE_ONLY = ["br", "wbr", "hr", "meta", "link", "img"]
  99. SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"]
  100. MARKUP_TO_HTML = {
  101. "#": "li",
  102. "*": "li",
  103. ";": "dt",
  104. ":": "dd",
  105. }
  106. def get_html_tag(markup):
  107. """Return the HTML tag associated with the given wiki-markup."""
  108. return MARKUP_TO_HTML[markup]
  109. def is_parsable(tag):
  110. """Return if the given *tag*'s contents should be passed to the parser."""
  111. return tag.lower() not in PARSER_BLACKLIST
  112. def is_visible(tag):
  113. """Return whether or not the given *tag* contains visible text."""
  114. return tag.lower() not in INVISIBLE_TAGS
  115. def is_single(tag):
  116. """Return whether or not the given *tag* can exist without a close tag."""
  117. return tag.lower() in SINGLE
  118. def is_single_only(tag):
  119. """Return whether or not the given *tag* must exist without a close tag."""
  120. return tag.lower() in SINGLE_ONLY
  121. def is_scheme(scheme, slashes=True):
  122. """Return whether *scheme* is valid for external links."""
  123. scheme = scheme.lower()
  124. if slashes:
  125. return scheme in URI_SCHEMES
  126. return scheme in URI_SCHEMES and not URI_SCHEMES[scheme]