A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

154 lines
5.5 KiB

  1. #
  2. # Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
  3. #
  4. # Permission is hereby granted, free of charge, to any person obtaining a copy
  5. # of this software and associated documentation files (the "Software"), to deal
  6. # in the Software without restriction, including without limitation the rights
  7. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. # copies of the Software, and to permit persons to whom the Software is
  9. # furnished to do so, subject to the following conditions:
  10. #
  11. # The above copyright notice and this permission notice shall be included in
  12. # all copies or substantial portions of the Software.
  13. #
  14. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  20. # SOFTWARE.
  21. import html.entities as htmlentities
  22. from ._base import Node
  23. __all__ = ["HTMLEntity"]
  24. class HTMLEntity(Node):
  25. """Represents an HTML entity, like ``&nbsp;``, either named or unnamed."""
  26. def __init__(self, value, named=None, hexadecimal=False, hex_char="x"):
  27. super().__init__()
  28. self._value = value
  29. if named is None: # Try to guess whether or not the entity is named
  30. try:
  31. int(value)
  32. self._named = False
  33. self._hexadecimal = False
  34. except ValueError:
  35. try:
  36. int(value, 16)
  37. self._named = False
  38. self._hexadecimal = True
  39. except ValueError:
  40. self._named = True
  41. self._hexadecimal = False
  42. else:
  43. self._named = named
  44. self._hexadecimal = hexadecimal
  45. self._hex_char = hex_char
  46. def __str__(self):
  47. if self.named:
  48. return "&{};".format(self.value)
  49. if self.hexadecimal:
  50. return "&#{}{};".format(self.hex_char, self.value)
  51. return "&#{};".format(self.value)
  52. def __strip__(self, **kwargs):
  53. if kwargs.get("normalize"):
  54. return self.normalize()
  55. return self
  56. @property
  57. def value(self):
  58. """The string value of the HTML entity."""
  59. return self._value
  60. @property
  61. def named(self):
  62. """Whether the entity is a string name for a codepoint or an integer.
  63. For example, ``&Sigma;``, ``&#931;``, and ``&#x3a3;`` refer to the same
  64. character, but only the first is "named", while the others are integer
  65. representations of the codepoint.
  66. """
  67. return self._named
  68. @property
  69. def hexadecimal(self):
  70. """If unnamed, this is whether the value is hexadecimal or decimal."""
  71. return self._hexadecimal
  72. @property
  73. def hex_char(self):
  74. """If the value is hexadecimal, this is the letter denoting that.
  75. For example, the hex_char of ``"&#x1234;"`` is ``"x"``, whereas the
  76. hex_char of ``"&#X1234;"`` is ``"X"``. Lowercase and uppercase ``x``
  77. are the only values supported.
  78. """
  79. return self._hex_char
  80. @value.setter
  81. def value(self, newval):
  82. newval = str(newval)
  83. try:
  84. int(newval)
  85. except ValueError:
  86. try:
  87. intval = int(newval, 16)
  88. except ValueError:
  89. if newval not in htmlentities.entitydefs:
  90. raise ValueError(f"entity value {newval!r} is not a valid name") from None
  91. self._named = True
  92. self._hexadecimal = False
  93. else:
  94. if intval < 0 or intval > 0x10FFFF:
  95. raise ValueError(
  96. f"entity value 0x{intval:x} is not in range(0x110000)") from None
  97. self._named = False
  98. self._hexadecimal = True
  99. else:
  100. test = int(newval, 16 if self.hexadecimal else 10)
  101. if test < 0 or test > 0x10FFFF:
  102. raise ValueError(f"entity value {test} is not in range(0x110000)")
  103. self._named = False
  104. self._value = newval
  105. @named.setter
  106. def named(self, newval):
  107. newval = bool(newval)
  108. if newval and self.value not in htmlentities.entitydefs:
  109. raise ValueError(f"entity value {self.value!r} is not a valid name")
  110. if not newval:
  111. try:
  112. int(self.value, 16)
  113. except ValueError as exc:
  114. raise ValueError(f"current entity value {self.value!r} "
  115. f"is not a valid Unicode codepoint") from exc
  116. self._named = newval
  117. @hexadecimal.setter
  118. def hexadecimal(self, newval):
  119. newval = bool(newval)
  120. if newval and self.named:
  121. raise ValueError("a named entity cannot be hexadecimal")
  122. self._hexadecimal = newval
  123. @hex_char.setter
  124. def hex_char(self, newval):
  125. newval = str(newval)
  126. if newval not in ("x", "X"):
  127. raise ValueError(newval)
  128. self._hex_char = newval
  129. def normalize(self):
  130. """Return the unicode character represented by the HTML entity."""
  131. if self.named:
  132. return chr(htmlentities.name2codepoint[self.value])
  133. if self.hexadecimal:
  134. return chr(int(self.value, 16))
  135. return chr(int(self.value))