A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

153 lines
5.4 KiB

  1. #
  2. # Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
  3. #
  4. # Permission is hereby granted, free of charge, to any person obtaining a copy
  5. # of this software and associated documentation files (the "Software"), to deal
  6. # in the Software without restriction, including without limitation the rights
  7. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. # copies of the Software, and to permit persons to whom the Software is
  9. # furnished to do so, subject to the following conditions:
  10. #
  11. # The above copyright notice and this permission notice shall be included in
  12. # all copies or substantial portions of the Software.
  13. #
  14. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  20. # SOFTWARE.
  21. import html.entities as htmlentities
  22. from . import Node
  23. __all__ = ["HTMLEntity"]
  24. class HTMLEntity(Node):
  25. """Represents an HTML entity, like ``&nbsp;``, either named or unnamed."""
  26. def __init__(self, value, named=None, hexadecimal=False, hex_char="x"):
  27. super().__init__()
  28. self._value = value
  29. if named is None: # Try to guess whether or not the entity is named
  30. try:
  31. int(value)
  32. self._named = False
  33. self._hexadecimal = False
  34. except ValueError:
  35. try:
  36. int(value, 16)
  37. self._named = False
  38. self._hexadecimal = True
  39. except ValueError:
  40. self._named = True
  41. self._hexadecimal = False
  42. else:
  43. self._named = named
  44. self._hexadecimal = hexadecimal
  45. self._hex_char = hex_char
  46. def __unicode__(self):
  47. if self.named:
  48. return "&{};".format(self.value)
  49. if self.hexadecimal:
  50. return "&#{}{};".format(self.hex_char, self.value)
  51. return "&#{};".format(self.value)
  52. def __strip__(self, **kwargs):
  53. if kwargs.get("normalize"):
  54. return self.normalize()
  55. return self
  56. @property
  57. def value(self):
  58. """The string value of the HTML entity."""
  59. return self._value
  60. @property
  61. def named(self):
  62. """Whether the entity is a string name for a codepoint or an integer.
  63. For example, ``&Sigma;``, ``&#931;``, and ``&#x3a3;`` refer to the same
  64. character, but only the first is "named", while the others are integer
  65. representations of the codepoint.
  66. """
  67. return self._named
  68. @property
  69. def hexadecimal(self):
  70. """If unnamed, this is whether the value is hexadecimal or decimal."""
  71. return self._hexadecimal
  72. @property
  73. def hex_char(self):
  74. """If the value is hexadecimal, this is the letter denoting that.
  75. For example, the hex_char of ``"&#x1234;"`` is ``"x"``, whereas the
  76. hex_char of ``"&#X1234;"`` is ``"X"``. Lowercase and uppercase ``x``
  77. are the only values supported.
  78. """
  79. return self._hex_char
  80. @value.setter
  81. def value(self, newval):
  82. newval = str(newval)
  83. try:
  84. int(newval)
  85. except ValueError:
  86. try:
  87. int(newval, 16)
  88. except ValueError:
  89. if newval not in htmlentities.entitydefs:
  90. raise ValueError("entity value is not a valid name")
  91. self._named = True
  92. self._hexadecimal = False
  93. else:
  94. if int(newval, 16) < 0 or int(newval, 16) > 0x10FFFF:
  95. raise ValueError("entity value is not in range(0x110000)")
  96. self._named = False
  97. self._hexadecimal = True
  98. else:
  99. test = int(newval, 16 if self.hexadecimal else 10)
  100. if test < 0 or test > 0x10FFFF:
  101. raise ValueError("entity value is not in range(0x110000)")
  102. self._named = False
  103. self._value = newval
  104. @named.setter
  105. def named(self, newval):
  106. newval = bool(newval)
  107. if newval and self.value not in htmlentities.entitydefs:
  108. raise ValueError("entity value is not a valid name")
  109. if not newval:
  110. try:
  111. int(self.value, 16)
  112. except ValueError:
  113. err = "current entity value is not a valid Unicode codepoint"
  114. raise ValueError(err)
  115. self._named = newval
  116. @hexadecimal.setter
  117. def hexadecimal(self, newval):
  118. newval = bool(newval)
  119. if newval and self.named:
  120. raise ValueError("a named entity cannot be hexadecimal")
  121. self._hexadecimal = newval
  122. @hex_char.setter
  123. def hex_char(self, newval):
  124. newval = str(newval)
  125. if newval not in ("x", "X"):
  126. raise ValueError(newval)
  127. self._hex_char = newval
  128. def normalize(self):
  129. """Return the unicode character represented by the HTML entity."""
  130. if self.named:
  131. return chr(htmlentities.name2codepoint[self.value])
  132. if self.hexadecimal:
  133. return chr(int(self.value, 16))
  134. return chr(int(self.value))