From f1f3fb0121b8f4e67f501cf845f6478131a61b00 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 26 Jul 2012 17:21:10 -0400
Subject: [PATCH] Determine whether an entity is named a bit more intelligently.

---
 mwparserfromhell/nodes/html_entity.py | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/mwparserfromhell/nodes/html_entity.py b/mwparserfromhell/nodes/html_entity.py
index 495b14a..8276d7e 100644
--- a/mwparserfromhell/nodes/html_entity.py
+++ b/mwparserfromhell/nodes/html_entity.py
@@ -30,9 +30,21 @@ class HTMLEntity(Node):
     def __init__(self, value, named=None, hexadecimal=False):
         self._value = value
         if named is None:  # Try to guess whether or not the entity is named
-            named = False if isinstance(value, int) else True
-        self._named = named
-        self._hexadecimal = hexadecimal
+            try:
+                int(value)
+                self._named = False
+                self._hexadecimal = False
+            except ValueError:
+                try:
+                    int(value, 16)
+                    self._named = False
+                    self._hexadecimal = True
+                except ValueError:
+                    self._named = True
+                    self._hexadecimal = False
+        else:
+            self._named = named
+            self._hexadecimal = hexadecimal
 
     def __unicode__(self):
         if self.named:
@@ -53,14 +65,15 @@ class HTMLEntity(Node):
             # Test whether we're on the wide or narrow Python build. Check the
             # length of a non-BMP code point (U+1F64A, SPEAK-NO-EVIL MONKEY):
             if len(u"\U0001F64A") == 2:
-                # Ensure this code point is within the range we can encode:
+                # Ensure this is within the range we can encode:
                 if value > 0x10FFFF:
                     raise ValueError("unichr() arg not in range(0x110000)")
-                if value >= 0x10000:
-                    code = value - 0x10000
-                    lead = 0xD800 + (code >> 10)
-                    trail = 0xDC00 + (code % (1 << 10))
-                    return unichr(lead) + unichr(trail)
+                code = value - 0x10000
+                if value < 0:  # Invalid code point
+                    raise
+                lead = 0xD800 + (code >> 10)
+                trail = 0xDC00 + (code % (1 << 10))
+                return unichr(lead) + unichr(trail)
             raise
 
     @property