From 2eb76e7de0ba51731027b710737d95e69c8123e2 Mon Sep 17 00:00:00 2001 From: Riamse Date: Mon, 20 Aug 2012 18:34:08 -0700 Subject: [PATCH] Adding Python 3 support --- .gitignore | 1 + mwparserfromhell/nodes/html_entity.py | 11 +-- mwparserfromhell/nodes/tag.py | 5 +- mwparserfromhell/nodes/template.py | 24 +++--- mwparserfromhell/nodes/text.py | 5 +- mwparserfromhell/parser/builder.py | 5 +- mwparserfromhell/parser/tokenizer.py | 3 +- mwparserfromhell/parser/tokens.py | 18 +++-- mwparserfromhell/string_mixin.py | 134 ++++++++++++++++++---------------- mwparserfromhell/utils.py | 5 +- mwparserfromhell/wikicode.py | 12 +-- 11 files changed, 130 insertions(+), 93 deletions(-) diff --git a/.gitignore b/.gitignore index 4984243..ba02a04 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,6 @@ *.egg *.egg-info .DS_Store +__pycache__ build docs/_build diff --git a/mwparserfromhell/nodes/html_entity.py b/mwparserfromhell/nodes/html_entity.py index 98bfd2e..f0df4b3 100644 --- a/mwparserfromhell/nodes/html_entity.py +++ b/mwparserfromhell/nodes/html_entity.py @@ -20,9 +20,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import htmlentitydefs +from __future__ import unicode_literals from . import Node +from ..compat import str, bytes, htmlentitydefs __all__ = ["HTMLEntity"] @@ -50,10 +51,10 @@ class HTMLEntity(Node): def __unicode__(self): if self.named: - return u"&{0};".format(self.value) + return "&{0};".format(self.value) if self.hexadecimal: - return u"&#{0}{1};".format(self.hex_char, self.value) - return u"&#{0};".format(self.value) + return "&#{0}{1};".format(self.hex_char, self.value) + return "&#{0};".format(self.value) def __strip__(self, normalize, collapse): if normalize: @@ -71,7 +72,7 @@ class HTMLEntity(Node): except ValueError: # Test whether we're on the wide or narrow Python build. Check the # length of a non-BMP code point (U+1F64A, SPEAK-NO-EVIL MONKEY): - if len(u"\U0001F64A") == 2: + if len("\U0001F64A") == 2: # Ensure this is within the range we can encode: if value > 0x10FFFF: raise ValueError("unichr() arg not in range(0x110000)") diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 24654b9..90cfb58 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -20,7 +20,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from __future__ import unicode_literals + from . import Node, Text +from ..compat import str, bytes __all__ = ["Tag"] @@ -92,7 +95,7 @@ class Tag(Node): result = "<" + unicode(self.tag) if self.attrs: - result += " " + u" ".join([unicode(attr) for attr in self.attrs]) + result += " " + " ".join([unicode(attr) for attr in self.attrs]) if self.self_closing: result += " " * self.open_padding + "/>" else: diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 581e8ce..b2a3b0d 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -20,12 +20,14 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from __future__ import unicode_literals from collections import defaultdict import re from . import HTMLEntity, Node, Text from .extras import Parameter from ..utils import parse_anything +from ..compat import str, bytes, basestring __all__ = ["Template"] @@ -42,10 +44,10 @@ class Template(Node): def __unicode__(self): if self.params: - params = u"|".join([unicode(param) for param in self.params]) - return "{{" + unicode(self.name) + "|" + params + "}}" + params = "|".join([str(param) for param in self.params]) + return "{{" + str(self.name) + "|" + params + "}}" else: - return "{{" + unicode(self.name) + "}}" + return "{{" + str(self.name) + "}}" def __iternodes__(self, getter): yield None, self @@ -77,7 +79,7 @@ class Template(Node): code.replace(node, node.replace(char, replacement)) def _blank_param_value(self, value): - match = re.search(r"^(\s*).*?(\s*)$", unicode(value), FLAGS) + match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS) value.nodes = [Text(match.group(1)), Text(match.group(2))] def _select_theory(self, theories): @@ -85,13 +87,13 @@ class Template(Node): best = max(theories.values()) confidence = float(best) / sum(theories.values()) if confidence > 0.75: - return theories.keys()[theories.values().index(best)] + return tuple(theories.keys())[tuple(theories.values()).index(best)] def _get_spacing_conventions(self): before_theories = defaultdict(lambda: 0) after_theories = defaultdict(lambda: 0) for param in self.params: - match = re.search(r"^(\s*).*?(\s*)$", unicode(param.value), FLAGS) + match = re.search(r"^(\s*).*?(\s*)$", str(param.value), FLAGS) before, after = match.group(1), match.group(2) before_theories[before] += 1 after_theories[after] += 1 @@ -124,7 +126,7 @@ class Template(Node): return self._params def has_param(self, name, ignore_empty=True): - name = name.strip() if isinstance(name, basestring) else unicode(name) + name = name.strip() if isinstance(name, basestring) else str(name) for param in self.params: if param.name.strip() == name: if ignore_empty and not param.value.strip(): @@ -133,7 +135,7 @@ class Template(Node): return False def get(self, name): - name = name.strip() if isinstance(name, basestring) else unicode(name) + name = name.strip() if isinstance(name, basestring) else str(name) for param in reversed(self.params): if param.name.strip() == name: return param @@ -159,7 +161,7 @@ class Template(Node): if showkey is None: try: - int_name = int(unicode(name)) + int_name = int(str(name)) except ValueError: showkey = True else: @@ -167,7 +169,7 @@ class Template(Node): for param in self.params: if not param.showkey: if re.match(r"[1-9][0-9]*$", param.name.strip()): - int_keys.add(int(unicode(param.name))) + int_keys.add(int(str(param.name))) expected = min(set(range(1, len(int_keys) + 2)) - int_keys) if expected == int_name: showkey = False @@ -188,7 +190,7 @@ class Template(Node): return param def remove(self, name, keep_field=False, force_no_field=False): - name = name.strip() if isinstance(name, basestring) else unicode(name) + name = name.strip() if isinstance(name, basestring) else str(name) removed = False for i, param in enumerate(self.params): if param.name.strip() == name: diff --git a/mwparserfromhell/nodes/text.py b/mwparserfromhell/nodes/text.py index 4b4d4ac..ce5b513 100644 --- a/mwparserfromhell/nodes/text.py +++ b/mwparserfromhell/nodes/text.py @@ -20,7 +20,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from __future__ import unicode_literals + from . import Node +from ..compat import str, bytes, basestring __all__ = ["Text"] @@ -30,7 +33,7 @@ class Text(Node): self._value = value def __unicode__(self): - return unicode(self.value) + return str(self.value) def __strip__(self, normalize, collapse): return self diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index d352321..fd94788 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -20,11 +20,14 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from __future__ import unicode_literals + from . import tokens from ..nodes import Heading, HTMLEntity, Tag, Template, Text from ..nodes.extras import Attribute, Parameter from ..smart_list import SmartList from ..wikicode import Wikicode +from ..compat import str, bytes __all__ = ["Builder"] @@ -62,7 +65,7 @@ class Builder(object): self._tokens.append(token) value = self._pop() if not key: - key = self._wrap([Text(unicode(default))]) + key = self._wrap([Text(str(default))]) return Parameter(key, value, showkey) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 159ba67..3b80c98 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -20,13 +20,14 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import htmlentitydefs +from __future__ import unicode_literals from math import log import re import string from . import contexts from . import tokens +from ..compat import htmlentitydefs __all__ = ["Tokenizer"] diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 3cb73c9..b76df0d 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -20,6 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from __future__ import unicode_literals +from ..compat import str, bytes, v + __all__ = ["Token"] class Token(object): @@ -33,7 +36,7 @@ class Token(object): args.append(key + "=" + repr(value[:97] + "...")) else: args.append(key + "=" + repr(value)) - return u"{0}({1})".format(type(self).__name__, u", ".join(args)) + return "{0}({1})".format(type(self).__name__, ", ".join(args)) def __eq__(self, other): if isinstance(other, type(self)): @@ -49,10 +52,15 @@ class Token(object): def __delattr__(self, key): del self._kwargs[key] - -def make(name): - __all__.append(name) - return type(name, (Token,), {}) +if v >= 3: + def make(name): + __all__.append(name) + return type(name, (Token,), {}) +else: + def make(name): + name = name.encode("utf-8") + __all__.append(name) + return type(name, (Token,), {}) Text = make("Text") diff --git a/mwparserfromhell/string_mixin.py b/mwparserfromhell/string_mixin.py index e0c8364..99981ae 100644 --- a/mwparserfromhell/string_mixin.py +++ b/mwparserfromhell/string_mixin.py @@ -20,230 +20,240 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from __future__ import unicode_literals +from .compat import str, bytes, v + __all__ = ["StringMixIn"] def inheritdoc(method): - method.__doc__ = getattr(unicode, method.func_name).__doc__ + try: + method.__doc__ = getattr(str, method.__name__).__doc__ + except AttributeError: + method.__doc__ = "This feature is only available on Python 2." return method class StringMixIn(object): - def __str__(self): - return unicode(self).encode("utf8") + if v >= 3: + def __str__(self): + return self.__unicode__() + else: + def __str__(self): + return self.__unicode__().encode("utf8") def __repr__(self): - return repr(unicode(self)) + return repr(self.__unicode__()) def __lt__(self, other): if isinstance(other, StringMixIn): - return unicode(self) < unicode(other) - return unicode(self) < other + return self.__unicode__() < other.__unicode__() + return self.__unicode__() < other def __le__(self, other): if isinstance(other, StringMixIn): - return unicode(self) <= unicode(other) - return unicode(self) <= other + return self.__unicode__() <= other.__unicode__() + return self.__unicode__() <= other def __eq__(self, other): if isinstance(other, StringMixIn): - return unicode(self) == unicode(other) - return unicode(self) == other + return self.__unicode__() == other.__unicode__() + return self.__unicode__() == other def __ne__(self, other): if isinstance(other, StringMixIn): - return unicode(self) != unicode(other) - return unicode(self) != other + return self.__unicode__() != other.__unicode__() + return self.__unicode__() != other def __gt__(self, other): if isinstance(other, StringMixIn): - return unicode(self) > unicode(other) - return unicode(self) > other + return self.__unicode__() > other.__unicode__() + return self.__unicode__() > other def __ge__(self, other): if isinstance(other, StringMixIn): - return unicode(self) >= unicode(other) - return unicode(self) >= other + return self.__unicode__() >= other.__unicode__() + return self.__unicode__() >= other def __nonzero__(self): - return bool(unicode(self)) + return bool(self.__unicode__()) def __unicode__(self): raise NotImplementedError() def __len__(self): - return len(unicode(self)) + return len(self.__unicode__()) def __iter__(self): - for char in unicode(self): + for char in self.__unicode__(): yield char def __getitem__(self, key): - return unicode(self)[key] + return self.__unicode__()[key] def __contains__(self, item): if isinstance(item, StringMixIn): - return unicode(item) in unicode(self) - return item in unicode(self) + return unicode(item) in self.__unicode__() + return item in self.__unicode__() @inheritdoc def capitalize(self): - return unicode(self).capitalize() + return self.__unicode__().capitalize() @inheritdoc def center(self, width, fillchar=None): - return unicode(self).center(width, fillchar) + return self.__unicode__().center(width, fillchar) @inheritdoc def count(self, sub=None, start=None, end=None): - return unicode(self).count(sub, start, end) + return self.__unicode__().count(sub, start, end) @inheritdoc def decode(self, encoding=None, errors=None): - return unicode(self).decode(encoding, errors) + return self.__unicode__().decode(encoding, errors) @inheritdoc def encode(self, encoding=None, errors=None): - return unicode(self).encode(encoding, errors) + return self.__unicode__().encode(encoding, errors) @inheritdoc def endswith(self, prefix, start=None, end=None): - return unicode(self).endswith(prefix, start, end) + return self.__unicode__().endswith(prefix, start, end) @inheritdoc def expandtabs(self, tabsize=None): - return unicode(self).expandtabs(tabsize) + return self.__unicode__().expandtabs(tabsize) @inheritdoc def find(self, sub=None, start=None, end=None): - return unicode(self).find(sub, start, end) + return self.__unicode__().find(sub, start, end) @inheritdoc def format(self, *args, **kwargs): - return unicode(self).format(*args, **kwargs) + return self.__unicode__().format(*args, **kwargs) @inheritdoc def index(self, sub=None, start=None, end=None): - return unicode(self).index(sub, start, end) + return self.__unicode__().index(sub, start, end) @inheritdoc def isalnum(self): - return unicode(self).isalnum() + return self.__unicode__().isalnum() @inheritdoc def isalpha(self): - return unicode(self).isalpha() + return self.__unicode__().isalpha() @inheritdoc def isdecimal(self): - return unicode(self).isdecimal() + return self.__unicode__().isdecimal() @inheritdoc def isdigit(self): - return unicode(self).isdigit() + return self.__unicode__().isdigit() @inheritdoc def islower(self): - return unicode(self).islower() + return self.__unicode__().islower() @inheritdoc def isnumeric(self): - return unicode(self).isnumeric() + return self.__unicode__().isnumeric() @inheritdoc def isspace(self): - return unicode(self).isspace() + return self.__unicode__().isspace() @inheritdoc def istitle(self): - return unicode(self).istitle() + return self.__unicode__().istitle() @inheritdoc def isupper(self): - return unicode(self).isupper() + return self.__unicode__().isupper() @inheritdoc def join(self, iterable): - return unicode(self).join(iterable) + return self.__unicode__().join(iterable) @inheritdoc def ljust(self, width, fillchar=None): - return unicode(self).ljust(width, fillchar) + return self.__unicode__().ljust(width, fillchar) @inheritdoc def lower(self): - return unicode(self).lower() + return self.__unicode__().lower() @inheritdoc def lstrip(self, chars=None): - return unicode(self).lstrip(chars) + return self.__unicode__().lstrip(chars) @inheritdoc def partition(self, sep): - return unicode(self).partition(sep) + return self.__unicode__().partition(sep) @inheritdoc def replace(self, old, new, count): - return unicode(self).replace(old, new, count) + return self.__unicode__().replace(old, new, count) @inheritdoc def rfind(self, sub=None, start=None, end=None): - return unicode(self).rfind(sub, start, end) + return self.__unicode__().rfind(sub, start, end) @inheritdoc def rindex(self, sub=None, start=None, end=None): - return unicode(self).rindex(sub, start, end) + return self.__unicode__().rindex(sub, start, end) @inheritdoc def rjust(self, width, fillchar=None): - return unicode(self).rjust(width, fillchar) + return self.__unicode__().rjust(width, fillchar) @inheritdoc def rpartition(self, sep): - return unicode(self).rpartition(sep) + return self.__unicode__().rpartition(sep) @inheritdoc def rsplit(self, sep=None, maxsplit=None): - return unicode(self).rsplit(sep, maxsplit) + return self.__unicode__().rsplit(sep, maxsplit) @inheritdoc def rstrip(self, chars=None): - return unicode(self).rstrip(chars) + return self.__unicode__().rstrip(chars) @inheritdoc def split(self, sep=None, maxsplit=None): - return unicode(self).split(sep, maxsplit) + return self.__unicode__().split(sep, maxsplit) @inheritdoc def splitlines(self, keepends=None): - return unicode(self).splitlines(keepends) + return self.__unicode__().splitlines(keepends) @inheritdoc def startswith(self, prefix, start=None, end=None): - return unicode(self).startswith(prefix, start, end) + return self.__unicode__().startswith(prefix, start, end) @inheritdoc def strip(self, chars=None): - return unicode(self).strip(chars) + return self.__unicode__().strip(chars) @inheritdoc def swapcase(self): - return unicode(self).swapcase() + return self.__unicode__().swapcase() @inheritdoc def title(self): - return unicode(self).title() + return self.__unicode__().title() @inheritdoc def translate(self, table, deletechars=None): - return unicode(self).translate(table, deletechars) + return self.__unicode__().translate(table, deletechars) @inheritdoc def upper(self): - return unicode(self).upper() + return self.__unicode__().upper() @inheritdoc def zfill(self, width): - return unicode(self).zfill(width) + return self.__unicode__().zfill(width) del inheritdoc diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index 9c32c10..b92609a 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -20,9 +20,12 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from __future__ import unicode_literals + import mwparserfromhell from .nodes import Node from .smart_list import SmartList +from .compat import str, bytes, basestring def parse_anything(value): wikicode = mwparserfromhell.wikicode.Wikicode @@ -33,7 +36,7 @@ def parse_anything(value): if isinstance(value, basestring): return mwparserfromhell.parse(value) if isinstance(value, int): - return mwparserfromhell.parse(unicode(value)) + return mwparserfromhell.parse(str(value)) if value is None: return wikicode(SmartList()) try: diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index 1199e4b..390fad0 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -20,12 +20,14 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from __future__ import unicode_literals import re import sys from .nodes import Heading, Node, Tag, Template, Text from .string_mixin import StringMixIn from .utils import parse_anything +from .compat import str, bytes __all__ = ["Wikicode"] @@ -37,7 +39,7 @@ class Wikicode(StringMixIn): self._nodes = nodes def __unicode__(self): - return "".join([unicode(node) for node in self.nodes]) + return "".join([str(node) for node in self.nodes]) def _get_children(self, node): for context, child in node.__iternodes__(self._get_all_nodes): @@ -171,7 +173,7 @@ class Wikicode(StringMixIn): nodes = self.nodes for node in nodes: if not forcetype or isinstance(node, forcetype): - if not matches or re.search(matches, unicode(node), flags): + if not matches or re.search(matches, str(node), flags): yield node def ifilter_templates(self, recursive=False, matches=None, flags=FLAGS): @@ -229,15 +231,15 @@ class Wikicode(StringMixIn): for node in self.nodes: stripped = node.__strip__(normalize, collapse) if stripped: - nodes.append(unicode(stripped)) + nodes.append(str(stripped)) if collapse: - stripped = u"".join(nodes).strip("\n") + stripped = "".join(nodes).strip("\n") while "\n\n\n" in stripped: stripped = stripped.replace("\n\n\n", "\n\n") return stripped else: - return u"".join(nodes) + return "".join(nodes) def get_tree(self): marker = object() # Random object we can find with certainty in a list