diff --git a/mwparserfromhell/nodes/html_entity.py b/mwparserfromhell/nodes/html_entity.py
index b849fee..495b14a 100644
--- a/mwparserfromhell/nodes/html_entity.py
+++ b/mwparserfromhell/nodes/html_entity.py
@@ -41,6 +41,28 @@ class HTMLEntity(Node):
return u"{0};".format(self.value)
return u"{0};".format(self.value)
+ def _unichr(self, value):
+ """Implement the builtin unichr() with support for non-BMP code points.
+
+ Values the builtin accepts are returned by unichr() directly. On narrow builds,
+ 0x10000-0x10FFFF yields a surrogate pair instead; anything else raises ValueError.
+ """
+ try:
+ return unichr(value)
+ except ValueError:
+ # Test whether we're on the wide or narrow Python build. Check the
+ # length of a non-BMP code point (U+1F64A, SPEAK-NO-EVIL MONKEY):
+ if len(u"\U0001F64A") == 2:
+ # Ensure this code point is within the range we can encode:
+ if value > 0x10FFFF:
+ raise ValueError("unichr() arg not in range(0x110000)")
+ if value >= 0x10000:
+ code = value - 0x10000  # 20-bit offset past the BMP (at most 0xFFFFF here)
+ lead = 0xD800 + (code >> 10)  # lead surrogate carries the top 10 bits
+ trail = 0xDC00 + (code % (1 << 10))  # trail surrogate carries the low 10 bits
+ return unichr(lead) + unichr(trail)
+ raise  # nothing above applied: re-raise the ValueError from unichr()
+
@property
def value(self):
return self._value
@@ -57,5 +79,5 @@ class HTMLEntity(Node):
if self.named:
return unichr(htmlentitydefs.name2codepoint[self.value])
if self.hexadecimal:
- return unichr(int(str(self.value), 16))
- return unichr(self.value)
+ return self._unichr(int(self.value, 16))
+ return self._unichr(int(self.value))