Browse Source

HTMLEntity; strip_code(); iterators for _nodify(); unicode in StringMixIn's MRO

tags/v0.1
Ben Kurtovic 11 years ago
parent
commit
c47cbb42e9
4 changed files with 75 additions and 8 deletions
  1. +1
    -0
      mwparserfromhell/nodes/__init__.py
  2. +50
    -0
      mwparserfromhell/nodes/html_entity.py
  3. +1
    -1
      mwparserfromhell/string_mixin.py
  4. +23
    -7
      mwparserfromhell/wikicode.py

+ 1
- 0
mwparserfromhell/nodes/__init__.py View File

@@ -28,5 +28,6 @@ class Node(StringMixIn):
pass

from mwparserfromhell.nodes import extras
from mwparserfromhell.nodes.html_entity import HTMLEntity
from mwparserfromhell.nodes.template import Template
from mwparserfromhell.nodes.text import Text

+ 50
- 0
mwparserfromhell/nodes/html_entity.py View File

@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import htmlentitydefs

from mwparserfromhell.nodes import Node

# Names exported by ``from mwparserfromhell.nodes.html_entity import *``.
__all__ = ["HTMLEntity"]

class HTMLEntity(Node):
    """A node holding one HTML entity, written ``&value;`` or ``&#value;``.

    ``value`` is the text between the leading ``&`` (or ``&#``) and the
    trailing ``;``.  ``named`` selects which of the two forms is used when
    the node is rendered and how ``value`` is interpreted by normalize().
    """

    def __init__(self, value, named):
        self._value = value
        self._named = named

    def __unicode__(self):
        """Render the entity as source text: ``&value;`` or ``&#value;``."""
        if self.named:
            return u"&{0};".format(self.value)
        return u"&#{0};".format(self.value)

    @property
    def value(self):
        """The entity's name when ``named`` is true, else its code point."""
        return self._value

    @property
    def named(self):
        """True if the entity is written by name rather than by number."""
        return self._named

    def normalize(self):
        """Return the single unicode character that this entity represents.

        Raises KeyError if a named entity is not listed in
        ``htmlentitydefs.name2codepoint``, and ValueError if a numeric
        entity's value cannot be read as a decimal integer.
        """
        if self.named:
            return unichr(htmlentitydefs.name2codepoint[self.value])
        # __unicode__ renders the numeric form correctly whether value is an
        # int or a string of decimal digits, so accept both here as well;
        # unichr() on its own only takes an int.
        return unichr(int(self.value))

+ 1
- 1
mwparserfromhell/string_mixin.py View File

@@ -22,7 +22,7 @@

__all__ = ["StringMixIn"]

class StringMixIn(object):
class StringMixIn(unicode):
def __str__(self):
return unicode(self).encode("utf8")



+ 23
- 7
mwparserfromhell/wikicode.py View File

@@ -20,11 +20,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import htmlentitydefs
import re

import mwparserfromhell
from mwparserfromhell.nodes import Node, Template, Text
from mwparserfromhell.nodes import HTMLEntity, Node, Template, Text
from mwparserfromhell.string_mixin import StringMixIn

__all__ = ["Wikicode"]
@@ -43,10 +42,18 @@ class Wikicode(StringMixIn):
return value.nodes
if isinstance(value, Node):
return [value]
if isinstance(value, str) or isinstance(value, unicode):
if isinstance(value, basestring):
return mwparserfromhell.parse(value).nodes
error = "Needs string, Node, or Wikicode object, but got {0}: {1}"
raise ValueError(error.format(type(value), value))

try:
nodelist = list(value)
except TypeError:
error = "Needs string, Node, iterable of Nodes, or Wikicode object, but got {0}: {1}"
raise ValueError(error.format(type(value), value))
if not all([isinstance(node, Node) for node in nodelist]):
error = "Was passed an interable {0}, but it did not contain all Nodes: {1}"
raise ValueError(error.format(type(value), value))
return nodelist

def _get_children(self, node):
yield node
@@ -217,8 +224,17 @@ class Wikicode(StringMixIn):
return list(self.ifilter_text(recursive, matches, flags))

def strip_code(self, normalize=True):
    """Return the text of this object's top-level nodes with markup removed.

    Only Text and HTMLEntity nodes found directly in ``self.nodes``
    contribute to the result; every other kind of node is dropped.  The
    collected pieces are joined with single spaces.

    If ``normalize`` is true, each HTMLEntity is replaced by the character
    returned by its normalize() method; otherwise the entity is kept as it
    is written in the source.
    """
    pieces = []
    for node in self.nodes:
        if isinstance(node, Text):
            pieces.append(unicode(node))
        elif isinstance(node, HTMLEntity):
            # Render explicitly so the join below only ever sees plain
            # unicode strings, never node objects.
            pieces.append(node.normalize() if normalize else unicode(node))
    return u" ".join(pieces)

def get_tree(self):
marker = object() # Random object we can find with certainty in a list


Loading…
Cancel
Save