A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

155 lines
5.6 KiB

  1. #
  2. # Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
  3. #
  4. # Permission is hereby granted, free of charge, to any person obtaining a copy
  5. # of this software and associated documentation files (the "Software"), to deal
  6. # in the Software without restriction, including without limitation the rights
  7. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. # copies of the Software, and to permit persons to whom the Software is
  9. # furnished to do so, subject to the following conditions:
  10. #
  11. # The above copyright notice and this permission notice shall be included in
  12. # all copies or substantial portions of the Software.
  13. #
  14. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  20. # SOFTWARE.
  21. import html.entities as htmlentities
  22. from ._base import Node
  23. __all__ = ["HTMLEntity"]
  24. class HTMLEntity(Node):
  25. """Represents an HTML entity, like ``&nbsp;``, either named or unnamed."""
  26. def __init__(self, value, named=None, hexadecimal=False, hex_char="x"):
  27. super().__init__()
  28. self._value = value
  29. if named is None: # Try to guess whether or not the entity is named
  30. try:
  31. int(value)
  32. self._named = False
  33. self._hexadecimal = False
  34. except ValueError:
  35. try:
  36. int(value, 16)
  37. self._named = False
  38. self._hexadecimal = True
  39. except ValueError:
  40. self._named = True
  41. self._hexadecimal = False
  42. else:
  43. self._named = named
  44. self._hexadecimal = hexadecimal
  45. self._hex_char = hex_char
  46. def __str__(self):
  47. if self.named:
  48. return "&{};".format(self.value)
  49. if self.hexadecimal:
  50. return "&#{}{};".format(self.hex_char, self.value)
  51. return "&#{};".format(self.value)
  52. def __strip__(self, **kwargs):
  53. if kwargs.get("normalize"):
  54. return self.normalize()
  55. return self
  56. @property
  57. def value(self):
  58. """The string value of the HTML entity."""
  59. return self._value
  60. @property
  61. def named(self):
  62. """Whether the entity is a string name for a codepoint or an integer.
  63. For example, ``&Sigma;``, ``&#931;``, and ``&#x3a3;`` refer to the same
  64. character, but only the first is "named", while the others are integer
  65. representations of the codepoint.
  66. """
  67. return self._named
  68. @property
  69. def hexadecimal(self):
  70. """If unnamed, this is whether the value is hexadecimal or decimal."""
  71. return self._hexadecimal
  72. @property
  73. def hex_char(self):
  74. """If the value is hexadecimal, this is the letter denoting that.
  75. For example, the hex_char of ``"&#x1234;"`` is ``"x"``, whereas the
  76. hex_char of ``"&#X1234;"`` is ``"X"``. Lowercase and uppercase ``x``
  77. are the only values supported.
  78. """
  79. return self._hex_char
  80. @value.setter
  81. def value(self, newval):
  82. newval = str(newval)
  83. try:
  84. int(newval)
  85. except ValueError:
  86. try:
  87. intval = int(newval, 16)
  88. except ValueError:
  89. if newval not in htmlentities.entitydefs:
  90. raise ValueError(
  91. "entity value {!r} is not a valid name".format(newval)) from None
  92. self._named = True
  93. self._hexadecimal = False
  94. else:
  95. if intval < 0 or intval > 0x10FFFF:
  96. raise ValueError(
  97. "entity value 0x{:x} is not in range(0x110000)".format(intval)) from None
  98. self._named = False
  99. self._hexadecimal = True
  100. else:
  101. test = int(newval, 16 if self.hexadecimal else 10)
  102. if test < 0 or test > 0x10FFFF:
  103. raise ValueError("entity value {} is not in range(0x110000)".format(test))
  104. self._named = False
  105. self._value = newval
  106. @named.setter
  107. def named(self, newval):
  108. newval = bool(newval)
  109. if newval and self.value not in htmlentities.entitydefs:
  110. raise ValueError("entity value {!r} is not a valid name".format(self.value))
  111. if not newval:
  112. try:
  113. int(self.value, 16)
  114. except ValueError as exc:
  115. raise ValueError("current entity value {!r} is not a valid "
  116. "Unicode codepoint".format(self.value)) from exc
  117. self._named = newval
  118. @hexadecimal.setter
  119. def hexadecimal(self, newval):
  120. newval = bool(newval)
  121. if newval and self.named:
  122. raise ValueError("a named entity cannot be hexadecimal")
  123. self._hexadecimal = newval
  124. @hex_char.setter
  125. def hex_char(self, newval):
  126. newval = str(newval)
  127. if newval not in ("x", "X"):
  128. raise ValueError(newval)
  129. self._hex_char = newval
  130. def normalize(self):
  131. """Return the unicode character represented by the HTML entity."""
  132. if self.named:
  133. return chr(htmlentities.name2codepoint[self.value])
  134. if self.hexadecimal:
  135. return chr(int(self.value, 16))
  136. return chr(int(self.value))