A Python parser for MediaWiki wikicode https://mwparserfromhell.readthedocs.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

преди 8 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from __future__ import unicode_literals
  23. from . import Node
  24. from ..compat import htmlentities, py3k, str
  25. __all__ = ["HTMLEntity"]
  26. class HTMLEntity(Node):
  27. """Represents an HTML entity, like ``&nbsp;``, either named or unnamed."""
  28. def __init__(self, value, named=None, hexadecimal=False, hex_char="x"):
  29. super(HTMLEntity, self).__init__()
  30. self._value = value
  31. if named is None: # Try to guess whether or not the entity is named
  32. try:
  33. int(value)
  34. self._named = False
  35. self._hexadecimal = False
  36. except ValueError:
  37. try:
  38. int(value, 16)
  39. self._named = False
  40. self._hexadecimal = True
  41. except ValueError:
  42. self._named = True
  43. self._hexadecimal = False
  44. else:
  45. self._named = named
  46. self._hexadecimal = hexadecimal
  47. self._hex_char = hex_char
  48. def __unicode__(self):
  49. if self.named:
  50. return "&{};".format(self.value)
  51. if self.hexadecimal:
  52. return "&#{}{};".format(self.hex_char, self.value)
  53. return "&#{};".format(self.value)
  54. def __strip__(self, **kwargs):
  55. if kwargs.get("normalize"):
  56. return self.normalize()
  57. return self
  58. if not py3k:
  59. @staticmethod
  60. def _unichr(value):
  61. """Implement builtin unichr() with support for non-BMP code points.
  62. On wide Python builds, this functions like the normal unichr(). On
  63. narrow builds, this returns the value's encoded surrogate pair.
  64. """
  65. try:
  66. return unichr(value)
  67. except ValueError:
  68. # Test whether we're on the wide or narrow Python build. Check
  69. # the length of a non-BMP code point
  70. # (U+1F64A, SPEAK-NO-EVIL MONKEY):
  71. if len("\U0001F64A") == 1: # pragma: no cover
  72. raise
  73. # Ensure this is within the range we can encode:
  74. if value > 0x10FFFF:
  75. raise ValueError("unichr() arg not in range(0x110000)")
  76. code = value - 0x10000
  77. if value < 0: # Invalid code point
  78. raise
  79. lead = 0xD800 + (code >> 10)
  80. trail = 0xDC00 + (code % (1 << 10))
  81. return unichr(lead) + unichr(trail)
  82. @property
  83. def value(self):
  84. """The string value of the HTML entity."""
  85. return self._value
  86. @property
  87. def named(self):
  88. """Whether the entity is a string name for a codepoint or an integer.
  89. For example, ``&Sigma;``, ``&#931;``, and ``&#x3a3;`` refer to the same
  90. character, but only the first is "named", while the others are integer
  91. representations of the codepoint.
  92. """
  93. return self._named
  94. @property
  95. def hexadecimal(self):
  96. """If unnamed, this is whether the value is hexadecimal or decimal."""
  97. return self._hexadecimal
  98. @property
  99. def hex_char(self):
  100. """If the value is hexadecimal, this is the letter denoting that.
  101. For example, the hex_char of ``"&#x1234;"`` is ``"x"``, whereas the
  102. hex_char of ``"&#X1234;"`` is ``"X"``. Lowercase and uppercase ``x``
  103. are the only values supported.
  104. """
  105. return self._hex_char
  106. @value.setter
  107. def value(self, newval):
  108. newval = str(newval)
  109. try:
  110. int(newval)
  111. except ValueError:
  112. try:
  113. int(newval, 16)
  114. except ValueError:
  115. if newval not in htmlentities.entitydefs:
  116. raise ValueError("entity value is not a valid name")
  117. self._named = True
  118. self._hexadecimal = False
  119. else:
  120. if int(newval, 16) < 0 or int(newval, 16) > 0x10FFFF:
  121. raise ValueError("entity value is not in range(0x110000)")
  122. self._named = False
  123. self._hexadecimal = True
  124. else:
  125. test = int(newval, 16 if self.hexadecimal else 10)
  126. if test < 0 or test > 0x10FFFF:
  127. raise ValueError("entity value is not in range(0x110000)")
  128. self._named = False
  129. self._value = newval
  130. @named.setter
  131. def named(self, newval):
  132. newval = bool(newval)
  133. if newval and self.value not in htmlentities.entitydefs:
  134. raise ValueError("entity value is not a valid name")
  135. if not newval:
  136. try:
  137. int(self.value, 16)
  138. except ValueError:
  139. err = "current entity value is not a valid Unicode codepoint"
  140. raise ValueError(err)
  141. self._named = newval
  142. @hexadecimal.setter
  143. def hexadecimal(self, newval):
  144. newval = bool(newval)
  145. if newval and self.named:
  146. raise ValueError("a named entity cannot be hexadecimal")
  147. self._hexadecimal = newval
  148. @hex_char.setter
  149. def hex_char(self, newval):
  150. newval = str(newval)
  151. if newval not in ("x", "X"):
  152. raise ValueError(newval)
  153. self._hex_char = newval
  154. def normalize(self):
  155. """Return the unicode character represented by the HTML entity."""
  156. chrfunc = chr if py3k else HTMLEntity._unichr
  157. if self.named:
  158. return chrfunc(htmlentities.name2codepoint[self.value])
  159. if self.hexadecimal:
  160. return chrfunc(int(self.value, 16))
  161. return chrfunc(int(self.value))