Browse Source

Merge aa4b45c1bb into e6fa7b2b2d

pull/2/merge
Riamse 12 years ago
parent
commit
9667e2de5a
13 changed files with 141 additions and 98 deletions
  1. +1
    -0
      .gitignore
  2. +6
    -3
      mwparserfromhell/nodes/extras/attribute.py
  3. +5
    -2
      mwparserfromhell/nodes/extras/parameter.py
  4. +6
    -5
      mwparserfromhell/nodes/html_entity.py
  5. +4
    -1
      mwparserfromhell/nodes/tag.py
  6. +13
    -11
      mwparserfromhell/nodes/template.py
  7. +4
    -1
      mwparserfromhell/nodes/text.py
  8. +4
    -1
      mwparserfromhell/parser/builder.py
  9. +2
    -1
      mwparserfromhell/parser/tokenizer.py
  10. +13
    -5
      mwparserfromhell/parser/tokens.py
  11. +72
    -62
      mwparserfromhell/string_mixin.py
  12. +4
    -1
      mwparserfromhell/utils.py
  13. +7
    -5
      mwparserfromhell/wikicode.py

+ 1
- 0
.gitignore View File

@@ -2,5 +2,6 @@
*.egg
*.egg-info
.DS_Store
__pycache__
build
docs/_build

+ 6
- 3
mwparserfromhell/nodes/extras/attribute.py View File

@@ -20,7 +20,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from ...string_mixin import StringMixIn
from ...compat import str, bytes

__all__ = ["Attribute"]

@@ -34,9 +37,9 @@ class Attribute(StringMixIn):
def __unicode__(self):
    """Return the attribute as text.

    The result is ``name`` alone when ``self.value`` is falsy,
    ``name="value"`` when ``self.quoted`` is true, and ``name=value``
    otherwise.
    """
    # ``str`` (not ``unicode``) is used throughout so the same code runs on
    # interpreters where ``unicode`` does not exist.
    if self.value:
        if self.quoted:
            return str(self.name) + '="' + str(self.value) + '"'
        return str(self.name) + "=" + str(self.value)
    return str(self.name)

@property
def name(self):


+ 5
- 2
mwparserfromhell/nodes/extras/parameter.py View File

@@ -20,8 +20,11 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from ...string_mixin import StringMixIn
from ...utils import parse_anything
from ...compat import str, bytes

__all__ = ["Parameter"]

@@ -34,8 +37,8 @@ class Parameter(StringMixIn):

def __unicode__(self):
    """Return the parameter as text.

    The result is ``name=value`` when ``self.showkey`` is true and just
    ``value`` otherwise.
    """
    # ``str`` (not ``unicode``) keeps this working where ``unicode`` is
    # not defined.
    if self.showkey:
        return str(self.name) + "=" + str(self.value)
    return str(self.value)

@property
def name(self):


+ 6
- 5
mwparserfromhell/nodes/html_entity.py View File

@@ -20,9 +20,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import htmlentitydefs
from __future__ import unicode_literals

from . import Node
from ..compat import str, bytes, htmlentitydefs

__all__ = ["HTMLEntity"]

@@ -50,10 +51,10 @@ class HTMLEntity(Node):

def __unicode__(self):
    """Return the entity as text.

    Named entities render as ``&value;``, hexadecimal ones as
    ``&#<hex_char>value;`` and everything else as ``&#value;``.
    """
    if self.named:
        return "&{0};".format(self.value)
    if self.hexadecimal:
        # ``hex_char`` is written between "#" and the value.
        return "&#{0}{1};".format(self.hex_char, self.value)
    return "&#{0};".format(self.value)

def __strip__(self, normalize, collapse):
if normalize:
@@ -71,7 +72,7 @@ class HTMLEntity(Node):
except ValueError:
# Test whether we're on the wide or narrow Python build. Check the
# length of a non-BMP code point (U+1F64A, SPEAK-NO-EVIL MONKEY):
if len(u"\U0001F64A") == 2:
if len("\U0001F64A") == 2:
# Ensure this is within the range we can encode:
if value > 0x10FFFF:
raise ValueError("unichr() arg not in range(0x110000)")


+ 4
- 1
mwparserfromhell/nodes/tag.py View File

@@ -20,7 +20,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from . import Node, Text
from ..compat import str, bytes

__all__ = ["Tag"]

@@ -92,7 +95,7 @@ class Tag(Node):

result = "<" + unicode(self.tag)
if self.attrs:
result += " " + u" ".join([unicode(attr) for attr in self.attrs])
result += " " + " ".join([unicode(attr) for attr in self.attrs])
if self.self_closing:
result += " " * self.open_padding + "/>"
else:


+ 13
- 11
mwparserfromhell/nodes/template.py View File

@@ -20,12 +20,14 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
from collections import defaultdict
import re

from . import HTMLEntity, Node, Text
from .extras import Parameter
from ..utils import parse_anything
from ..compat import str, bytes, basestring

__all__ = ["Template"]

@@ -42,10 +44,10 @@ class Template(Node):

def __unicode__(self):
    """Return the template as text.

    The result is ``{{name|p1|p2|...}}`` when ``self.params`` is non-empty
    and ``{{name}}`` otherwise.
    """
    # ``str`` (not ``unicode``) keeps this working where ``unicode`` is
    # not defined.
    if self.params:
        params = "|".join([str(param) for param in self.params])
        return "{{" + str(self.name) + "|" + params + "}}"
    return "{{" + str(self.name) + "}}"

def __iternodes__(self, getter):
yield None, self
@@ -77,7 +79,7 @@ class Template(Node):
code.replace(node, node.replace(char, replacement))

def _blank_param_value(self, value):
    """Replace the nodes of *value* with its surrounding whitespace only.

    The text of *value* is matched against ``^(\\s*).*?(\\s*)$``; the two
    captured whitespace runs become two ``Text`` nodes, which are assigned
    to ``value.nodes``.
    """
    # ``str`` (not ``unicode``) keeps this working where ``unicode`` is
    # not defined. FLAGS is a module-level constant defined elsewhere.
    match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS)
    value.nodes = [Text(match.group(1)), Text(match.group(2))]

def _select_theory(self, theories):
@@ -85,13 +87,13 @@ class Template(Node):
best = max(theories.values())
confidence = float(best) / sum(theories.values())
if confidence > 0.75:
return theories.keys()[theories.values().index(best)]
return tuple(theories.keys())[tuple(theories.values()).index(best)]

def _get_spacing_conventions(self):
before_theories = defaultdict(lambda: 0)
after_theories = defaultdict(lambda: 0)
for param in self.params:
match = re.search(r"^(\s*).*?(\s*)$", unicode(param.value), FLAGS)
match = re.search(r"^(\s*).*?(\s*)$", str(param.value), FLAGS)
before, after = match.group(1), match.group(2)
before_theories[before] += 1
after_theories[after] += 1
@@ -128,7 +130,7 @@ class Template(Node):
self._name = parse_anything(value)

def has_param(self, name, ignore_empty=True):
name = name.strip() if isinstance(name, basestring) else unicode(name)
name = name.strip() if isinstance(name, basestring) else str(name)
for param in self.params:
if param.name.strip() == name:
if ignore_empty and not param.value.strip():
@@ -137,7 +139,7 @@ class Template(Node):
return False

def get(self, name):
name = name.strip() if isinstance(name, basestring) else unicode(name)
name = name.strip() if isinstance(name, basestring) else str(name)
for param in reversed(self.params):
if param.name.strip() == name:
return param
@@ -163,7 +165,7 @@ class Template(Node):

if showkey is None:
try:
int_name = int(unicode(name))
int_name = int(str(name))
except ValueError:
showkey = True
else:
@@ -171,7 +173,7 @@ class Template(Node):
for param in self.params:
if not param.showkey:
if re.match(r"[1-9][0-9]*$", param.name.strip()):
int_keys.add(int(unicode(param.name)))
int_keys.add(int(str(param.name)))
expected = min(set(range(1, len(int_keys) + 2)) - int_keys)
if expected == int_name:
showkey = False
@@ -192,7 +194,7 @@ class Template(Node):
return param

def remove(self, name, keep_field=False, force_no_field=False):
name = name.strip() if isinstance(name, basestring) else unicode(name)
name = name.strip() if isinstance(name, basestring) else str(name)
removed = False
for i, param in enumerate(self.params):
if param.name.strip() == name:


+ 4
- 1
mwparserfromhell/nodes/text.py View File

@@ -20,7 +20,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from . import Node
from ..compat import str, bytes, basestring

__all__ = ["Text"]

@@ -30,7 +33,7 @@ class Text(Node):
self._value = value

def __unicode__(self):
    """Return ``self.value`` converted to text."""
    # ``str`` (not ``unicode``) keeps this working where ``unicode`` is
    # not defined.
    return str(self.value)

def __strip__(self, normalize, collapse):
return self


+ 4
- 1
mwparserfromhell/parser/builder.py View File

@@ -20,11 +20,14 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from . import tokens
from ..nodes import Heading, HTMLEntity, Tag, Template, Text
from ..nodes.extras import Attribute, Parameter
from ..smart_list import SmartList
from ..wikicode import Wikicode
from ..compat import str, bytes

__all__ = ["Builder"]

@@ -62,7 +65,7 @@ class Builder(object):
self._tokens.append(token)
value = self._pop()
if not key:
key = self._wrap([Text(unicode(default))])
key = self._wrap([Text(str(default))])
return Parameter(key, value, showkey)
else:
self._write(self._handle_token(token))


+ 2
- 1
mwparserfromhell/parser/tokenizer.py View File

@@ -20,13 +20,14 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import htmlentitydefs
from __future__ import unicode_literals
from math import log
import re
import string

from . import contexts
from . import tokens
from ..compat import htmlentitydefs

__all__ = ["Tokenizer"]



+ 13
- 5
mwparserfromhell/parser/tokens.py View File

@@ -20,6 +20,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
from ..compat import str, bytes, v

__all__ = ["Token"]

class Token(object):
@@ -33,7 +36,7 @@ class Token(object):
args.append(key + "=" + repr(value[:97] + "..."))
else:
args.append(key + "=" + repr(value))
return u"{0}({1})".format(type(self).__name__, u", ".join(args))
return "{0}({1})".format(type(self).__name__, ", ".join(args))

def __eq__(self, other):
if isinstance(other, type(self)):
@@ -49,10 +52,15 @@ class Token(object):
def __delattr__(self, key):
del self._kwargs[key]


# Exactly one ``make`` is defined, chosen by ``v`` (imported from the compat
# module; presumably the major Python version -- confirm there).
if v >= 3:
    def make(name):
        """Create a ``Token`` subclass called *name* and export it.

        *name* is appended to ``__all__`` and the new class is returned.
        """
        __all__.append(name)
        return type(name, (Token,), {})
else:
    def make(name):
        """Create a ``Token`` subclass called *name* and export it.

        *name* is UTF-8 encoded first, then appended to ``__all__`` and
        used as the class name; the new class is returned.
        """
        # NOTE(review): the encode is presumably needed because string
        # literals in this module are unicode under ``unicode_literals``
        # while Python 2's ``type()`` wants a byte-string name -- confirm.
        name = name.encode("utf-8")
        __all__.append(name)
        return type(name, (Token,), {})

Text = make("Text")



+ 72
- 62
mwparserfromhell/string_mixin.py View File

@@ -20,230 +20,240 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
from .compat import str, bytes, v

__all__ = ["StringMixIn"]

def inheritdoc(method):
    """Decorator: copy the docstring of the same-named ``str`` method.

    If ``str`` has no attribute called ``method.__name__``, the docstring
    is set to a fixed notice instead. *method* itself is returned, so the
    decorated function is otherwise unchanged.
    """
    try:
        method.__doc__ = getattr(str, method.__name__).__doc__
    except AttributeError:
        # ``str`` lacks this method on the running interpreter.
        method.__doc__ = "This feature is only available on Python 2."
    return method


class StringMixIn(object):
    """Mix-in that makes an object behave like the text it renders to.

    Subclasses implement ``__unicode__()``. Every comparison, container
    protocol method and string method defined here is delegated to the value
    that call returns.

    Optional arguments that default to ``None`` are only forwarded to the
    underlying string method when the caller actually supplied them, because
    most string methods reject an explicit ``None``.
    """

    # ``v`` comes from the compat module; presumably the major Python
    # version -- confirm there. Only one ``__str__`` is defined.
    if v >= 3:
        def __str__(self):
            return self.__unicode__()
    else:
        def __str__(self):
            return self.__unicode__().encode("utf8")

    def __repr__(self):
        return repr(self.__unicode__())

    # Rich comparisons: compare rendered text with rendered text when the
    # other operand is also a StringMixIn, otherwise with the operand itself.

    def __lt__(self, other):
        if isinstance(other, StringMixIn):
            return self.__unicode__() < other.__unicode__()
        return self.__unicode__() < other

    def __le__(self, other):
        if isinstance(other, StringMixIn):
            return self.__unicode__() <= other.__unicode__()
        return self.__unicode__() <= other

    def __eq__(self, other):
        if isinstance(other, StringMixIn):
            return self.__unicode__() == other.__unicode__()
        return self.__unicode__() == other

    def __ne__(self, other):
        if isinstance(other, StringMixIn):
            return self.__unicode__() != other.__unicode__()
        return self.__unicode__() != other

    def __gt__(self, other):
        if isinstance(other, StringMixIn):
            return self.__unicode__() > other.__unicode__()
        return self.__unicode__() > other

    def __ge__(self, other):
        if isinstance(other, StringMixIn):
            return self.__unicode__() >= other.__unicode__()
        return self.__unicode__() >= other

    def __nonzero__(self):
        return bool(self.__unicode__())

    def __unicode__(self):
        """Return the text form of the object; subclasses must override."""
        raise NotImplementedError()

    def __len__(self):
        return len(self.__unicode__())

    def __iter__(self):
        for char in self.__unicode__():
            yield char

    def __getitem__(self, key):
        return self.__unicode__()[key]

    def __contains__(self, item):
        if isinstance(item, StringMixIn):
            # Call the method directly: the ``unicode`` builtin is not
            # available on every supported interpreter.
            return item.__unicode__() in self.__unicode__()
        return item in self.__unicode__()

    # String methods. ``inheritdoc`` replaces each docstring with the one
    # from the matching ``str`` method, so notes here are plain comments.

    @inheritdoc
    def capitalize(self):
        return self.__unicode__().capitalize()

    @inheritdoc
    def center(self, width, fillchar=None):
        if fillchar is None:
            return self.__unicode__().center(width)
        return self.__unicode__().center(width, fillchar)

    @inheritdoc
    def count(self, sub=None, start=None, end=None):
        return self.__unicode__().count(sub, start, end)

    @inheritdoc
    def decode(self, encoding=None, errors=None):
        # Forward only the arguments that were given. ``errors`` without
        # ``encoding`` is passed through unchanged, as before.
        if errors is not None:
            return self.__unicode__().decode(encoding, errors)
        if encoding is not None:
            return self.__unicode__().decode(encoding)
        return self.__unicode__().decode()

    @inheritdoc
    def encode(self, encoding=None, errors=None):
        # Forward only the arguments that were given. ``errors`` without
        # ``encoding`` is passed through unchanged, as before.
        if errors is not None:
            return self.__unicode__().encode(encoding, errors)
        if encoding is not None:
            return self.__unicode__().encode(encoding)
        return self.__unicode__().encode()

    @inheritdoc
    def endswith(self, prefix, start=None, end=None):
        return self.__unicode__().endswith(prefix, start, end)

    @inheritdoc
    def expandtabs(self, tabsize=None):
        if tabsize is None:
            return self.__unicode__().expandtabs()
        return self.__unicode__().expandtabs(tabsize)

    @inheritdoc
    def find(self, sub=None, start=None, end=None):
        return self.__unicode__().find(sub, start, end)

    @inheritdoc
    def format(self, *args, **kwargs):
        return self.__unicode__().format(*args, **kwargs)

    @inheritdoc
    def index(self, sub=None, start=None, end=None):
        return self.__unicode__().index(sub, start, end)

    @inheritdoc
    def isalnum(self):
        return self.__unicode__().isalnum()

    @inheritdoc
    def isalpha(self):
        return self.__unicode__().isalpha()

    @inheritdoc
    def isdecimal(self):
        return self.__unicode__().isdecimal()

    @inheritdoc
    def isdigit(self):
        return self.__unicode__().isdigit()

    @inheritdoc
    def islower(self):
        return self.__unicode__().islower()

    @inheritdoc
    def isnumeric(self):
        return self.__unicode__().isnumeric()

    @inheritdoc
    def isspace(self):
        return self.__unicode__().isspace()

    @inheritdoc
    def istitle(self):
        return self.__unicode__().istitle()

    @inheritdoc
    def isupper(self):
        return self.__unicode__().isupper()

    @inheritdoc
    def join(self, iterable):
        return self.__unicode__().join(iterable)

    @inheritdoc
    def ljust(self, width, fillchar=None):
        if fillchar is None:
            return self.__unicode__().ljust(width)
        return self.__unicode__().ljust(width, fillchar)

    @inheritdoc
    def lower(self):
        return self.__unicode__().lower()

    @inheritdoc
    def lstrip(self, chars=None):
        return self.__unicode__().lstrip(chars)

    @inheritdoc
    def partition(self, sep):
        return self.__unicode__().partition(sep)

    @inheritdoc
    def replace(self, old, new, count=None):
        # ``count`` is optional, as it is on the underlying string method.
        if count is None:
            return self.__unicode__().replace(old, new)
        return self.__unicode__().replace(old, new, count)

    @inheritdoc
    def rfind(self, sub=None, start=None, end=None):
        return self.__unicode__().rfind(sub, start, end)

    @inheritdoc
    def rindex(self, sub=None, start=None, end=None):
        return self.__unicode__().rindex(sub, start, end)

    @inheritdoc
    def rjust(self, width, fillchar=None):
        if fillchar is None:
            return self.__unicode__().rjust(width)
        return self.__unicode__().rjust(width, fillchar)

    @inheritdoc
    def rpartition(self, sep):
        return self.__unicode__().rpartition(sep)

    @inheritdoc
    def rsplit(self, sep=None, maxsplit=None):
        if maxsplit is None:
            return self.__unicode__().rsplit(sep)
        return self.__unicode__().rsplit(sep, maxsplit)

    @inheritdoc
    def rstrip(self, chars=None):
        return self.__unicode__().rstrip(chars)

    @inheritdoc
    def split(self, sep=None, maxsplit=None):
        if maxsplit is None:
            return self.__unicode__().split(sep)
        return self.__unicode__().split(sep, maxsplit)

    @inheritdoc
    def splitlines(self, keepends=None):
        if keepends is None:
            return self.__unicode__().splitlines()
        return self.__unicode__().splitlines(keepends)

    @inheritdoc
    def startswith(self, prefix, start=None, end=None):
        return self.__unicode__().startswith(prefix, start, end)

    @inheritdoc
    def strip(self, chars=None):
        return self.__unicode__().strip(chars)

    @inheritdoc
    def swapcase(self):
        return self.__unicode__().swapcase()

    @inheritdoc
    def title(self):
        return self.__unicode__().title()

    @inheritdoc
    def translate(self, table, deletechars=None):
        # Text ``translate`` takes only the table; forward ``deletechars``
        # only when the caller supplied it.
        if deletechars is None:
            return self.__unicode__().translate(table)
        return self.__unicode__().translate(table, deletechars)

    @inheritdoc
    def upper(self):
        return self.__unicode__().upper()

    @inheritdoc
    def zfill(self, width):
        return self.__unicode__().zfill(width)


del inheritdoc

+ 4
- 1
mwparserfromhell/utils.py View File

@@ -20,9 +20,12 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

import mwparserfromhell
from .nodes import Node
from .smart_list import SmartList
from .compat import str, bytes, basestring

def parse_anything(value):
wikicode = mwparserfromhell.wikicode.Wikicode
@@ -33,7 +36,7 @@ def parse_anything(value):
if isinstance(value, basestring):
return mwparserfromhell.parse(value)
if isinstance(value, int):
return mwparserfromhell.parse(unicode(value))
return mwparserfromhell.parse(str(value))
if value is None:
return wikicode(SmartList())
try:


+ 7
- 5
mwparserfromhell/wikicode.py View File

@@ -20,12 +20,14 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import re
import sys

from .nodes import Heading, Node, Tag, Template, Text
from .string_mixin import StringMixIn
from .utils import parse_anything
from .compat import str, bytes

__all__ = ["Wikicode"]

@@ -40,7 +42,7 @@ class Wikicode(StringMixIn):
self._nodes = nodes

def __unicode__(self):
    """Return the concatenated text of every node in ``self.nodes``."""
    # ``str`` (not ``unicode``) keeps this working where ``unicode`` is
    # not defined.
    return "".join([str(node) for node in self.nodes])

def _get_children(self, node):
"""Iterate over all descendants of a given node, including itself.
@@ -193,7 +195,7 @@ class Wikicode(StringMixIn):
nodes = self.nodes
for node in nodes:
if not forcetype or isinstance(node, forcetype):
if not matches or re.search(matches, unicode(node), flags):
if not matches or re.search(matches, str(node), flags):
yield node

def ifilter_templates(self, recursive=False, matches=None, flags=FLAGS):
@@ -251,15 +253,15 @@ class Wikicode(StringMixIn):
for node in self.nodes:
stripped = node.__strip__(normalize, collapse)
if stripped:
nodes.append(unicode(stripped))
nodes.append(str(stripped))

if collapse:
stripped = u"".join(nodes).strip("\n")
stripped = "".join(nodes).strip("\n")
while "\n\n\n" in stripped:
stripped = stripped.replace("\n\n\n", "\n\n")
return stripped
else:
return u"".join(nodes)
return "".join(nodes)

def get_tree(self):
marker = object() # Random object we can find with certainty in a list


Loading…
Cancel
Save