From c47cbb42e9b2a9065d26bd6e52cf1abe9ec6b283 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 16 Jul 2012 03:28:30 -0400 Subject: [PATCH] HTMLEntity; strip_code(); iterators for _nodify(); unicode in StringMixIn's MRO --- mwparserfromhell/nodes/__init__.py | 1 + mwparserfromhell/nodes/html_entity.py | 50 +++++++++++++++++++++++++++++++++++ mwparserfromhell/string_mixin.py | 2 +- mwparserfromhell/wikicode.py | 30 ++++++++++++++++----- 4 files changed, 75 insertions(+), 8 deletions(-) create mode 100644 mwparserfromhell/nodes/html_entity.py diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index bc2a5b4..f75ceac 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -28,5 +28,6 @@ class Node(StringMixIn): pass from mwparserfromhell.nodes import extras +from mwparserfromhell.nodes.html_entity import HTMLEntity from mwparserfromhell.nodes.template import Template from mwparserfromhell.nodes.text import Text diff --git a/mwparserfromhell/nodes/html_entity.py b/mwparserfromhell/nodes/html_entity.py new file mode 100644 index 0000000..e150289 --- /dev/null +++ b/mwparserfromhell/nodes/html_entity.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
import htmlentitydefs

from mwparserfromhell.nodes import Node

__all__ = ["HTMLEntity"]


class HTMLEntity(Node):
    """An HTML character reference, named (``&nbsp;``) or numeric (``&#107;``).

    ``value`` is the text that goes between the leading ``&`` (or ``&#``) and
    the closing ``;``.  ``named`` selects which of the two spellings is used
    when the node is rendered.
    """

    def __init__(self, value, named):
        """Store the entity's *value* and whether it is a *named* entity."""
        self._value = value
        self._named = named

    def __unicode__(self):
        """Return the entity exactly as it is written in markup."""
        if self.named:
            return u"&{0};".format(self.value)
        return u"&#{0};".format(self.value)

    @property
    def value(self):
        """The entity's name (if ``named``) or its numeric code."""
        return self._value

    @property
    def named(self):
        """True for ``&name;`` entities, false for ``&#code;`` ones."""
        return self._named

    def normalize(self):
        """Return the single unicode character this entity stands for.

        Named entities are looked up in ``htmlentitydefs.name2codepoint``.
        Numeric entities accept an ``int``, a decimal string (``u"107"``) or
        a hexadecimal string with a leading ``x``/``X`` (``u"x6B"``).

        Raises:
            KeyError: the entity is named but the name is not a known HTML
                entity.
            ValueError: the entity is numeric but its string value is not a
                valid decimal or ``x``-prefixed hexadecimal number.
        """
        if self.named:
            return unichr(htmlentitydefs.name2codepoint[self.value])

        code = self.value
        if isinstance(code, basestring):
            # unichr() only takes integers, so textual values must be parsed
            # first; "&#x6B;" style references carry a hex number after the x.
            if code[:1] in (u"x", u"X"):
                code = int(code[1:], 16)
            else:
                code = int(code)
        return unichr(code)
def strip_code(self, normalize=True):
    """Return the plain text of this object's top-level nodes.

    Only ``Text`` and ``HTMLEntity`` nodes found directly in ``self.nodes``
    contribute to the result; every other kind of node is dropped.  The
    pieces are joined with a single space.

    If *normalize* is true, each ``HTMLEntity`` is replaced by the result of
    its ``normalize()`` method; otherwise it is kept in its markup form.

    Any exception raised by ``HTMLEntity.normalize()`` propagates to the
    caller.
    """
    pieces = []
    for node in self.nodes:
        if isinstance(node, Text):
            # Convert explicitly: unicode.join() never calls __unicode__,
            # so passing the node object itself would not yield its text.
            pieces.append(unicode(node))
        elif isinstance(node, HTMLEntity):
            if normalize:
                pieces.append(node.normalize())
            else:
                pieces.append(unicode(node))

    return u" ".join(pieces)