From 43fc28e8e80d99f4b73c7604b538e3c57ab67c1d Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 26 Jul 2012 17:11:25 -0400
Subject: [PATCH] Implement Python-build-agnostic version of unichr().

---
Reviewer notes (text between the "---" marker and the first "diff --git"
line is commentary and is not part of the change itself):

* Hunk 1 adds HTMLEntity._unichr(value). It tries the builtin unichr()
  first. Only when that raises ValueError does it check whether a literal
  non-BMP character has length 2 (the code's own narrow-build test). If so,
  values above 0x10FFFF raise ValueError with an explicit message, and
  values from 0x10000 through 0x10FFFF are returned as a lead/trail
  surrogate pair (lead = 0xD800 + high bits, trail = 0xDC00 + low 10 bits).
  Any other failure re-raises the original ValueError.
* Hunk 2 routes the hexadecimal and decimal return paths through
  self._unichr(). The named-entity path still calls unichr() directly.
* Hunk 2 also changes the conversions: the hex path drops the str() wrapper
  (int(str(self.value), 16) -> int(self.value, 16)) and the decimal path
  now wraps self.value in int().
* NOTE(review): line breaks and indentation were reconstructed from a
  whitespace-collapsed copy of this patch, guided by the hunk line counts.
  The placement of the final bare "raise" at "except" level is presumed,
  and the author e-mail address was absent from that copy and has not been
  restored. Compare against commit 43fc28e before applying.

 mwparserfromhell/nodes/html_entity.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/mwparserfromhell/nodes/html_entity.py b/mwparserfromhell/nodes/html_entity.py
index b849fee..495b14a 100644
--- a/mwparserfromhell/nodes/html_entity.py
+++ b/mwparserfromhell/nodes/html_entity.py
@@ -41,6 +41,28 @@ class HTMLEntity(Node):
             return u"&#x{0};".format(self.value)
         return u"&#{0};".format(self.value)
 
+    def _unichr(self, value):
+        """Implement the builtin unichr() with support for non-BMP code points.
+
+        On wide Python builds, this functions like the normal unichr(). On
+        narrow builds, this returns the value's corresponding surrogate pair.
+        """
+        try:
+            return unichr(value)
+        except ValueError:
+            # Test whether we're on the wide or narrow Python build. Check the
+            # length of a non-BMP code point (U+1F64A, SPEAK-NO-EVIL MONKEY):
+            if len(u"\U0001F64A") == 2:
+                # Ensure this code point is within the range we can encode:
+                if value > 0x10FFFF:
+                    raise ValueError("unichr() arg not in range(0x110000)")
+                if value >= 0x10000:
+                    code = value - 0x10000
+                    lead = 0xD800 + (code >> 10)
+                    trail = 0xDC00 + (code % (1 << 10))
+                    return unichr(lead) + unichr(trail)
+            raise
+
     @property
     def value(self):
         return self._value
@@ -57,5 +79,5 @@ class HTMLEntity(Node):
         if self.named:
             return unichr(htmlentitydefs.name2codepoint[self.value])
         if self.hexadecimal:
-            return unichr(int(str(self.value), 16))
-        return unichr(self.value)
+            return self._unichr(int(self.value, 16))
+        return self._unichr(int(self.value))