Browse Source

Merged feature/parser into develop; good enough for now.

tags/v0.1
Ben Kurtovic 11 years ago
parent
commit
222f3b7dc3
17 changed files with 682 additions and 87 deletions
  1. +1
    -1
      mwparserfromhell/nodes/__init__.py
  2. +1
    -1
      mwparserfromhell/nodes/extras/attribute.py
  3. +1
    -1
      mwparserfromhell/nodes/extras/parameter.py
  4. +2
    -2
      mwparserfromhell/nodes/heading.py
  5. +9
    -4
      mwparserfromhell/nodes/html_entity.py
  6. +2
    -2
      mwparserfromhell/nodes/tag.py
  7. +51
    -17
      mwparserfromhell/nodes/template.py
  8. +1
    -1
      mwparserfromhell/nodes/text.py
  9. +19
    -1
      mwparserfromhell/parser/__init__.py
  10. +177
    -0
      mwparserfromhell/parser/builder.py
  11. +41
    -0
      mwparserfromhell/parser/contexts.py
  12. +0
    -53
      mwparserfromhell/parser/demo.py
  13. +285
    -0
      mwparserfromhell/parser/tokenizer.py
  14. +81
    -0
      mwparserfromhell/parser/tokens.py
  15. +2
    -0
      mwparserfromhell/smart_list.py
  16. +5
    -4
      mwparserfromhell/utils.py
  17. +4
    -0
      mwparserfromhell/wikicode.py

+ 1
- 1
mwparserfromhell/nodes/__init__.py View File

@@ -22,7 +22,7 @@

from ..string_mixin import StringMixIn

__all__ = ["Node"]
__all__ = ["Node", "Text", "Heading", "HTMLEntity", "Tag", "Template"]

class Node(StringMixIn):
def __unicode__(self):


+ 1
- 1
mwparserfromhell/nodes/extras/attribute.py View File

@@ -26,7 +26,7 @@ __all__ = ["Attribute"]

class Attribute(StringMixIn):
def __init__(self, name, value=None, quoted=True):
super(Attribute, self).__init__(self)
super(Attribute, self).__init__()
self._name = name
self._value = value
self._quoted = quoted


+ 1
- 1
mwparserfromhell/nodes/extras/parameter.py View File

@@ -27,7 +27,7 @@ __all__ = ["Parameter"]

class Parameter(StringMixIn):
def __init__(self, name, value, showkey=True):
super(Parameter, self).__init__(self)
super(Parameter, self).__init__()
self._name = name
self._value = value
self._showkey = showkey


+ 2
- 2
mwparserfromhell/nodes/heading.py View File

@@ -26,12 +26,12 @@ __all__ = ["Heading"]

class Heading(Node):
def __init__(self, title, level):
super(Heading, self).__init__(self)
super(Heading, self).__init__()
self._title = title
self._level = level

def __unicode__(self):
return ("=" * self.level) + self.title + ("=" * self.level)
return ("=" * self.level) + unicode(self.title) + ("=" * self.level)

def __iternodes__(self, getter):
yield None, self


+ 9
- 4
mwparserfromhell/nodes/html_entity.py View File

@@ -26,9 +26,9 @@ from . import Node

__all__ = ["HTMLEntity"]

class HTMLEntity(Node):
def __init__(self, value, named=None, hexadecimal=False):
super(HTMLEntity, self).__init__(self)
def __init__(self, value, named=None, hexadecimal=False, hex_char="x"):
super(HTMLEntity, self).__init__()
self._value = value
if named is None: # Try to guess whether or not the entity is named
try:
@@ -46,12 +46,13 @@ class HTMLEntity(Node):
else:
self._named = named
self._hexadecimal = hexadecimal
self._hex_char = hex_char

def __unicode__(self):
if self.named:
return u"&{0};".format(self.value)
if self.hexadecimal:
return u"&#x{0};".format(self.value)
return u"&#{0}{1};".format(self.hex_char, self.value)
return u"&#{0};".format(self.value)

def __strip__(self, normalize, collapse):
@@ -94,6 +95,10 @@ class HTMLEntity(Node):
def hexadecimal(self):
return self._hexadecimal

@property
def hex_char(self):
return self._hex_char

def normalize(self):
if self.named:
return unichr(htmlentitydefs.name2codepoint[self.value])


+ 2
- 2
mwparserfromhell/nodes/tag.py View File

@@ -67,9 +67,9 @@ class Tag(Node):
TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE))
TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE

def __init__(self, type_, tag, contents, attrs=None, showtag=True,
def __init__(self, type_, tag, contents=None, attrs=None, showtag=True,
self_closing=False, open_padding=0, close_padding=0):
super(Tag, self).__init__(self)
super(Tag, self).__init__()
self._type = type_
self._tag = tag
self._contents = contents


+ 51
- 17
mwparserfromhell/nodes/template.py View File

@@ -33,7 +33,7 @@ FLAGS = re.DOTALL | re.UNICODE

class Template(Node):
def __init__(self, name, params=None):
super(Template, self).__init__(self)
super(Template, self).__init__()
self._name = name
if params:
self._params = params
@@ -77,7 +77,7 @@ class Template(Node):
code.replace(node, node.replace(char, replacement))

def _blank_param_value(self, value):
match = re.search("^(\s*).*?(\s*)$", unicode(value), FLAGS)
match = re.search(r"^(\s*).*?(\s*)$", unicode(value), FLAGS)
value.nodes = [Text(match.group(1)), Text(match.group(2))]

def _select_theory(self, theories):
@@ -91,7 +91,7 @@ class Template(Node):
before_theories = defaultdict(lambda: 0)
after_theories = defaultdict(lambda: 0)
for param in self.params:
match = re.search("^(\s*).*?(\s*)$", unicode(param.value), FLAGS)
match = re.search(r"^(\s*).*?(\s*)$", unicode(param.value), FLAGS)
before, after = match.group(1), match.group(2)
before_theories[before] += 1
after_theories[after] += 1
@@ -100,6 +100,21 @@ class Template(Node):
after = self._select_theory(after_theories)
return before, after

def _remove_with_field(self, param, i, name):
if param.showkey:
following = self.params[i+1:]
better_matches = [after.name.strip() == name and not after.showkey for after in following]
if any(better_matches):
return False
return True

def _remove_without_field(self, param, i, force_no_field):
if not param.showkey and not force_no_field:
dependents = [not after.showkey for after in self.params[i+1:]]
if any(dependents):
return False
return True

@property
def name(self):
return self._name
@@ -119,7 +134,7 @@ class Template(Node):

def get(self, name):
name = name.strip() if isinstance(name, basestring) else unicode(name)
for param in self.params:
for param in reversed(self.params):
if param.name.strip() == name:
return param
raise ValueError(name)
@@ -131,10 +146,10 @@ class Template(Node):
if self.has_param(name):
self.remove(name, keep_field=True)
existing = self.get(name)
if showkey is None: # Infer showkey from current value
showkey = existing.showkey
if not showkey:
self._surface_escape(value, "=")
if showkey is not None:
if not showkey:
self._surface_escape(value, "=")
existing.showkey = showkey
nodes = existing.value.nodes
if force_nonconformity:
existing.value = value
@@ -144,10 +159,20 @@ class Template(Node):

if showkey is None:
try:
int(name)
showkey = True
int_name = int(unicode(name))
except ValueError:
showkey = False
showkey = True
else:
int_keys = set()
for param in self.params:
if not param.showkey:
if re.match(r"[1-9][0-9]*$", param.name.strip()):
int_keys.add(int(unicode(param.name)))
expected = min(set(range(1, len(int_keys) + 2)) - int_keys)
if expected == int_name:
showkey = False
else:
showkey = True
if not showkey:
self._surface_escape(value, "=")
if not force_nonconformity:
@@ -164,12 +189,21 @@ class Template(Node):

def remove(self, name, keep_field=False, force_no_field=False):
name = name.strip() if isinstance(name, basestring) else unicode(name)
removed = False
for i, param in enumerate(self.params):
if param.name.strip() == name:
if keep_field:
return self._blank_param_value(param.value)
dependent = [not after.showkey for after in self.params[i+1:]]
if any(dependent) and not param.showkey and not force_no_field:
return self._blank_param_value(param.value)
return self.params.remove(param)
raise ValueError(name)
if self._remove_with_field(param, i, name):
self._blank_param_value(param.value)
keep_field = False
else:
self.params.remove(param)
else:
if self._remove_without_field(param, i, force_no_field):
self.params.remove(param)
else:
self._blank_param_value(param.value)
if not removed:
removed = True
if not removed:
raise ValueError(name)

+ 1
- 1
mwparserfromhell/nodes/text.py View File

@@ -26,7 +26,7 @@ __all__ = ["Text"]

class Text(Node):
def __init__(self, value):
super(Text, self).__init__(self)
super(Text, self).__init__()
self._value = value

def __unicode__(self):


+ 19
- 1
mwparserfromhell/parser/__init__.py View File

@@ -20,4 +20,22 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from .demo import DemoParser as Parser
try:
from ._builder import CBuilder as Builder
from ._tokenizer import CTokenizer as Tokenizer
except ImportError:
from .builder import Builder
from .tokenizer import Tokenizer

__all__ = ["Parser"]

class Parser(object):
    """Turn a piece of wikitext into a node tree.

    Parsing is a two-stage pipeline: a tokenizer turns the text into a list
    of tokens, and a builder assembles those tokens into the final object.
    """

    def __init__(self, text):
        """Store *text* and create the tokenizer and builder used by parse()."""
        self.text = text
        self._tokenizer = Tokenizer()
        self._builder = Builder()

    def parse(self):
        """Tokenize the stored text and return whatever the builder produces."""
        return self._builder.build(self._tokenizer.tokenize(self.text))

+ 177
- 0
mwparserfromhell/parser/builder.py View File

@@ -0,0 +1,177 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from . import tokens
from ..nodes import Heading, HTMLEntity, Tag, Template, Text
from ..nodes.extras import Attribute, Parameter
from ..smart_list import SmartList
from ..wikicode import Wikicode

__all__ = ["Builder"]

class Builder(object):
    """Assemble a flat list of tokens into a tree of nodes.

    The builder keeps the pending tokens in ``self._tokens`` (stored reversed,
    so the next token is taken with ``pop()``) and a stack of node lists in
    ``self._stacks``. Each ``_handle_*`` method consumes the tokens belonging
    to one construct and returns the node it built.
    """

    def __init__(self):
        self._tokens = []
        self._stacks = []

    def _wrap(self, nodes):
        """Wrap a list of nodes in a Wikicode object backed by a SmartList."""
        return Wikicode(SmartList(nodes))

    def _push(self):
        """Start collecting nodes on a new, empty stack."""
        self._stacks.append([])

    def _pop(self, wrap=True):
        """Remove the current stack and return it.

        The nodes are wrapped with _wrap() unless *wrap* is false, in which
        case the raw list is returned.
        """
        if wrap:
            return self._wrap(self._stacks.pop())
        return self._stacks.pop()

    def _write(self, item):
        """Append *item* to the current stack."""
        self._stacks[-1].append(item)

    def _handle_parameter(self, default):
        """Build one template parameter from the pending tokens.

        *default* is the positional index used as the key when the parameter
        has no explicit ``key=``. The terminating separator/close token is
        pushed back so the caller can see it.
        """
        key = None
        showkey = False
        self._push()
        while self._tokens:
            token = self._tokens.pop()
            if isinstance(token, tokens.TemplateParamEquals):
                key = self._pop()
                showkey = True
                self._push()
            elif isinstance(token, (tokens.TemplateParamSeparator,
                                    tokens.TemplateClose)):
                self._tokens.append(token)
                value = self._pop()
                # Compare against None rather than testing truthiness: an
                # explicit but empty key ("|=value") must be kept as written
                # and not replaced by the positional default.
                if key is None:
                    key = self._wrap([Text(unicode(default))])
                return Parameter(key, value, showkey)
            else:
                self._write(self._handle_token(token))

    def _handle_template(self):
        """Build a Template node; the opening token was already consumed."""
        params = []
        default = 1
        self._push()
        while self._tokens:
            token = self._tokens.pop()
            if isinstance(token, tokens.TemplateParamSeparator):
                if not params:
                    # First separator: everything collected so far is the name.
                    name = self._pop()
                param = self._handle_parameter(default)
                params.append(param)
                if not param.showkey:
                    default += 1
            elif isinstance(token, tokens.TemplateClose):
                if not params:
                    name = self._pop()
                return Template(name, params)
            else:
                self._write(self._handle_token(token))

    def _handle_entity(self):
        """Build an HTMLEntity node; the start token was already consumed."""
        token = self._tokens.pop()
        if isinstance(token, tokens.HTMLEntityNumeric):
            token = self._tokens.pop()
            if isinstance(token, tokens.HTMLEntityHex):
                text = self._tokens.pop()
                self._tokens.pop()  # Remove HTMLEntityEnd
                return HTMLEntity(text.text, named=False, hexadecimal=True,
                                  hex_char=token.char)
            self._tokens.pop()  # Remove HTMLEntityEnd
            return HTMLEntity(token.text, named=False, hexadecimal=False)
        self._tokens.pop()  # Remove HTMLEntityEnd
        return HTMLEntity(token.text, named=True, hexadecimal=False)

    def _handle_heading(self, token):
        """Build a Heading node whose level comes from the start *token*."""
        level = token.level
        self._push()
        while self._tokens:
            token = self._tokens.pop()
            if isinstance(token, tokens.HeadingEnd):
                title = self._pop()
                return Heading(title, level)
            else:
                self._write(self._handle_token(token))

    def _handle_attribute(self):
        """Build one tag Attribute from the pending tokens.

        The token that ends the attribute (the next attribute start or the
        end of the opening tag) is pushed back for the caller.
        """
        name, quoted = None, False
        self._push()
        while self._tokens:
            token = self._tokens.pop()
            if isinstance(token, tokens.TagAttrEquals):
                name = self._pop()
                self._push()
            elif isinstance(token, tokens.TagAttrQuote):
                quoted = True
            elif isinstance(token, (tokens.TagAttrStart,
                                    tokens.TagCloseOpen)):
                self._tokens.append(token)
                if name is not None:
                    return Attribute(name, self._pop(), quoted)
                # No "=" was seen, so the collected nodes are the name itself.
                return Attribute(self._pop(), quoted=quoted)
            else:
                self._write(self._handle_token(token))

    def _handle_tag(self, token):
        """Build a Tag node; *token* is the token that opened the tag.

        NOTE(review): ``tag``, ``open_pad`` and ``contents`` are only bound
        when the matching tokens arrive in the expected order; a malformed
        token stream raises UnboundLocalError here.
        """
        type_, showtag = token.type, token.showtag
        attrs = []
        self._push()
        while self._tokens:
            token = self._tokens.pop()
            if isinstance(token, tokens.TagAttrStart):
                attrs.append(self._handle_attribute())
            elif isinstance(token, tokens.TagCloseOpen):
                open_pad = token.padding
                tag = self._pop()
                self._push()
            elif isinstance(token, tokens.TagCloseSelfclose):
                tag = self._pop()
                return Tag(type_, tag, attrs=attrs, showtag=showtag,
                           self_closing=True, open_padding=token.padding)
            elif isinstance(token, tokens.TagOpenClose):
                contents = self._pop()
            elif isinstance(token, tokens.TagCloseClose):
                return Tag(type_, tag, contents, attrs, showtag, False,
                           open_pad, token.padding)
            else:
                self._write(self._handle_token(token))

    def _handle_token(self, token):
        """Dispatch *token* to its handler and return the resulting node.

        Token types without a handler fall through and yield None.
        """
        if isinstance(token, tokens.Text):
            return Text(token.text)
        elif isinstance(token, tokens.TemplateOpen):
            return self._handle_template()
        elif isinstance(token, tokens.HTMLEntityStart):
            return self._handle_entity()
        elif isinstance(token, tokens.HeadingStart):
            return self._handle_heading(token)
        elif isinstance(token, tokens.TagOpenOpen):
            return self._handle_tag(token)

    def build(self, tokenlist):
        """Build and return the node tree for *tokenlist*.

        The tokens are copied before being reversed and consumed, so the
        caller's list is left untouched. Any stacks left over from an earlier
        build that ended with an exception are discarded.
        """
        self._tokens = list(tokenlist)
        self._tokens.reverse()
        self._stacks = []
        self._push()
        while self._tokens:
            node = self._handle_token(self._tokens.pop())
            self._write(node)
        return self._pop()

+ 41
- 0
mwparserfromhell/parser/contexts.py View File

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Local (stack-specific) contexts:

# Template states use the three lowest bits; TEMPLATE is the mask of all three.
TEMPLATE_NAME = 1 << 0
TEMPLATE_PARAM_KEY = 1 << 1
TEMPLATE_PARAM_VALUE = 1 << 2
TEMPLATE = TEMPLATE_NAME | TEMPLATE_PARAM_KEY | TEMPLATE_PARAM_VALUE

# One bit per heading level, in consecutive positions so that a level can be
# derived by shifting HEADING_LEVEL_1; HEADING is the mask of all six.
HEADING_LEVEL_1 = 1 << 3
HEADING_LEVEL_2 = 1 << 4
HEADING_LEVEL_3 = 1 << 5
HEADING_LEVEL_4 = 1 << 6
HEADING_LEVEL_5 = 1 << 7
HEADING_LEVEL_6 = 1 << 8
HEADING = (HEADING_LEVEL_1 | HEADING_LEVEL_2 | HEADING_LEVEL_3 |
           HEADING_LEVEL_4 | HEADING_LEVEL_5 | HEADING_LEVEL_6)


# Global contexts:

GL_HEADING = 1 << 0

+ 0
- 53
mwparserfromhell/parser/demo.py View File

@@ -1,53 +0,0 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from ..nodes import Template, Text
from ..nodes.extras import Parameter
from ..smart_list import SmartList
from ..wikicode import Wikicode

__all__ = ["DemoParser"]

class DemoParser(object):
    """Placeholder parser that always returns one hand-built sample tree."""

    def __init__(self, text):
        self.text = text

    def _tokenize(self):
        """Return an empty token list; no real tokenizing happens here."""
        return []

    def parse(self):
        """Return the node tree for a fixed sample sentence.

        The stored text is not read; the result is always the same tree.
        """
        # Ensure text is unicode!
        sample = u"This is a {{test}} message with a {{template|with|foo={{params}}}}."

        intro = Text(u"This is a ")
        plain_template = Template(Wikicode([Text(u"test")]))
        middle = Text(u" message with a ")
        positional = Parameter(Wikicode([Text(u"1")]),
                               Wikicode([Text(u"with")]), showkey=False)
        keyed = Parameter(Wikicode([Text(u"foo")]),
                          Wikicode([Template(Wikicode([Text(u"params")]))]),
                          showkey=True)
        full_template = Template(Wikicode([Text(u"template")]),
                                 [positional, keyed])
        ending = Text(u".")
        return Wikicode(SmartList([intro, plain_template, middle,
                                   full_template, ending]))

+ 285
- 0
mwparserfromhell/parser/tokenizer.py View File

@@ -0,0 +1,285 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import htmlentitydefs
from math import log
import re
import string

from . import contexts
from . import tokens

__all__ = ["Tokenizer"]

class BadRoute(Exception):
    """Raised when the tokenizer abandons the route it is currently trying."""


class Tokenizer(object):
    """Convert wikitext into a flat list of tokens.

    The input is first split into segments so that every marker character is
    a segment of its own; ``self._head`` is an index into that segment list,
    not a character offset. Parsing keeps a stack of frames, each of the form
    ``[tokens, context, textbuffer]``. A route that turns out to be invalid
    is abandoned by raising BadRoute, after which the caller rewinds the head
    and treats the text literally.
    """
    START = object()  # Returned by _read() for positions before the text
    END = object()    # Returned by _read() for positions after the text
    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
               "/", "-", "\n", END]
    regex = re.compile(r"([{}\[\]<>|=&#*;:/\-\n])", flags=re.IGNORECASE)

    def __init__(self):
        self._text = None
        self._head = 0
        self._stacks = []
        self._global = 0

    @property
    def _stack(self):
        """The token list of the current frame."""
        return self._stacks[-1][0]

    @property
    def _context(self):
        """The local context flags of the current frame."""
        return self._stacks[-1][1]

    @_context.setter
    def _context(self, value):
        self._stacks[-1][1] = value

    @property
    def _textbuffer(self):
        """Text segments of the current frame not yet emitted as a token."""
        return self._stacks[-1][2]

    @_textbuffer.setter
    def _textbuffer(self, value):
        self._stacks[-1][2] = value

    def _push(self, context=0):
        """Add a new, empty frame with the given local *context*."""
        self._stacks.append([[], context, []])

    def _push_textbuffer(self):
        """Flush buffered text into the current stack as one Text token."""
        if self._textbuffer:
            self._stack.append(tokens.Text(text="".join(self._textbuffer)))
            self._textbuffer = []

    def _pop(self):
        """Flush pending text, remove the current frame, return its tokens."""
        self._push_textbuffer()
        return self._stacks.pop()[0]

    def _fail_route(self):
        """Discard the current frame and raise BadRoute."""
        self._pop()
        raise BadRoute()

    def _write(self, token):
        """Append *token* to the current stack, flushing pending text first."""
        self._push_textbuffer()
        self._stack.append(token)

    def _write_text(self, text):
        """Buffer *text*; it becomes a Text token at the next flush."""
        self._textbuffer.append(text)

    def _write_all(self, tokenlist):
        """Append every token in *tokenlist* to the current stack.

        A leading Text token is merged into the text buffer so that adjacent
        text ends up in a single token. *tokenlist* is modified in that case.
        """
        if tokenlist and isinstance(tokenlist[0], tokens.Text):
            self._write_text(tokenlist.pop(0).text)
        self._push_textbuffer()
        self._stack.extend(tokenlist)

    def _read(self, delta=0, wrap=False, strict=False):
        """Return the segment *delta* positions away from the head.

        Returns START for positions before the text (unless *wrap* allows a
        negative index that is still in range) and END for positions past the
        text. With *strict*, running past the end fails the current route
        instead of returning END.
        """
        index = self._head + delta
        if index < 0 and (not wrap or abs(index) > len(self._text)):
            return self.START
        try:
            return self._text[index]
        except IndexError:
            if strict:
                self._fail_route()
            return self.END

    def _parse_template(self):
        """Try to parse a template at the head; fall back to literal text."""
        reset = self._head
        self._head += 2
        try:
            template = self._parse(contexts.TEMPLATE_NAME)
        except BadRoute:
            self._head = reset
            self._write_text(self._read())
        else:
            self._write(tokens.TemplateOpen())
            self._write_all(template)
            self._write(tokens.TemplateClose())

    def _verify_template_name(self):
        """Fail the route if the template name has a newline inside it.

        Only Text tokens are considered, and leading/trailing whitespace is
        ignored.
        """
        self._push_textbuffer()
        if self._stack:
            text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
            text = "".join([token.text for token in text])
            if text.strip() and "\n" in text.strip():
                self._fail_route()

    def _handle_template_param(self):
        """Handle a "|" inside a template: begin a new parameter key."""
        if self._context & contexts.TEMPLATE_NAME:
            self._verify_template_name()
            self._context ^= contexts.TEMPLATE_NAME
        if self._context & contexts.TEMPLATE_PARAM_VALUE:
            self._context ^= contexts.TEMPLATE_PARAM_VALUE
        self._context |= contexts.TEMPLATE_PARAM_KEY
        self._write(tokens.TemplateParamSeparator())

    def _handle_template_param_value(self):
        """Handle a "=" inside a parameter key: switch to the value."""
        self._context ^= contexts.TEMPLATE_PARAM_KEY
        self._context |= contexts.TEMPLATE_PARAM_VALUE
        self._write(tokens.TemplateParamEquals())

    def _handle_template_end(self):
        """Handle "}}": validate the name if needed and return the tokens."""
        if self._context & contexts.TEMPLATE_NAME:
            self._verify_template_name()
        self._head += 1
        return self._pop()

    def _parse_heading(self):
        """Try to parse a heading at the head; fall back to literal "="s."""
        self._global |= contexts.GL_HEADING
        reset = self._head
        self._head += 1
        best = 1
        while self._read() == "=":
            best += 1
            self._head += 1
        # Levels above six are capped at the level-six context.
        context = contexts.HEADING_LEVEL_1 << min(best - 1, 5)

        try:
            title, level = self._parse(context)
        except BadRoute:
            self._head = reset + best - 1
            self._write_text("=" * best)
        else:
            self._write(tokens.HeadingStart(level=level))
            if level < best:
                # Surplus opening "=" characters belong to the title text.
                self._write_text("=" * (best - level))
            self._write_all(title)
            self._write(tokens.HeadingEnd())
        finally:
            self._global ^= contexts.GL_HEADING

    def _handle_heading_end(self):
        """Handle a run of "=" inside a heading.

        Returns a ``(tokens, level)`` pair. If a later run of "=" on the same
        line closes the heading instead, this run is written as plain text.
        """
        reset = self._head
        self._head += 1
        best = 1
        while self._read() == "=":
            best += 1
            self._head += 1
        current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
        level = min(current, min(best, 6))

        try:
            after, after_level = self._parse(self._context)
        except BadRoute:
            if level < best:
                self._write_text("=" * (best - level))
            self._head = reset + best - 1
            return self._pop(), level
        else:
            self._write_text("=" * best)
            self._write_all(after)
            return self._pop(), after_level

    def _really_parse_entity(self):
        """Parse an HTML entity at the head, failing the route if invalid."""
        self._write(tokens.HTMLEntityStart())
        self._head += 1

        this = self._read(strict=True)
        if this == "#":
            numeric = True
            self._write(tokens.HTMLEntityNumeric())
            self._head += 1
            this = self._read(strict=True)
            if this[0].lower() == "x":
                hexadecimal = True
                self._write(tokens.HTMLEntityHex(char=this[0]))
                this = this[1:]
                if not this:
                    self._fail_route()
            else:
                hexadecimal = False
        else:
            numeric = hexadecimal = False

        valid = string.hexdigits if hexadecimal else string.digits
        if not numeric and not hexadecimal:
            valid += string.ascii_letters
        if not all([char in valid for char in this]):
            self._fail_route()

        self._head += 1
        if self._read() != ";":
            self._fail_route()
        if numeric:
            test = int(this, 16) if hexadecimal else int(this)
            if test < 1 or test > 0x10FFFF:
                self._fail_route()
        else:
            if this not in htmlentitydefs.entitydefs:
                self._fail_route()

        self._write(tokens.Text(text=this))
        self._write(tokens.HTMLEntityEnd())

    def _parse_entity(self):
        """Try to parse an HTML entity; fall back to a literal "&"."""
        reset = self._head
        self._push()
        try:
            self._really_parse_entity()
        except BadRoute:
            self._head = reset
            self._write_text(self._read())
        else:
            self._write_all(self._pop())

    def _parse(self, context=0):
        """Parse from the head in a new frame with the given *context*.

        Returns the frame's tokens when the text or the enclosing template
        ends, or the result of _handle_heading_end() when a heading ends.
        Raises BadRoute if the text ends inside a template or heading.
        """
        self._push(context)
        while True:
            this = self._read()
            if this not in self.MARKERS:
                self._write_text(this)
                self._head += 1
                continue
            if this is self.END:
                if self._context & (contexts.TEMPLATE | contexts.HEADING):
                    self._fail_route()
                return self._pop()
            prev, nxt = self._read(-1), self._read(1)
            if this == nxt == "{":
                self._parse_template()
            elif this == "|" and self._context & contexts.TEMPLATE:
                self._handle_template_param()
            elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
                self._handle_template_param_value()
            elif this == nxt == "}" and self._context & contexts.TEMPLATE:
                return self._handle_template_end()
            elif ((prev == "\n" or prev == self.START) and this == "=" and
                  not self._global & contexts.GL_HEADING):
                self._parse_heading()
            elif this == "=" and self._context & contexts.HEADING:
                return self._handle_heading_end()
            elif this == "\n" and self._context & contexts.HEADING:
                self._fail_route()
            elif this == "&":
                self._parse_entity()
            else:
                self._write_text(this)
            self._head += 1

    def tokenize(self, text):
        """Split *text* into segments and return the list of tokens for it."""
        # Start every run from a clean state so that one Tokenizer instance
        # can be used for several texts; otherwise the head would still point
        # at the end of the previous input.
        self._head = 0
        self._stacks = []
        self._global = 0
        split = self.regex.split(text)
        self._text = [segment for segment in split if segment]
        return self._parse()

+ 81
- 0
mwparserfromhell/parser/tokens.py View File

@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

__all__ = ["Token"]

class Token(object):
    """A token whose attributes are the keyword arguments it was built with.

    All attributes live in the ``_kwargs`` dict rather than as ordinary
    instance attributes: reading, assigning and deleting an attribute reads,
    sets and removes the matching dict entry.
    """

    def __init__(self, **kwargs):
        # Bypass our own __setattr__, which would try to use _kwargs itself.
        super(Token, self).__setattr__("_kwargs", kwargs)

    def __repr__(self):
        """Return ``TypeName(key=value, ...)``, shortening long strings."""
        args = []
        for key, value in self._kwargs.items():
            if isinstance(value, basestring) and len(value) > 100:
                args.append(key + "=" + repr(value[:97] + "..."))
            else:
                args.append(key + "=" + repr(value))
        return u"{0}({1})".format(type(self).__name__, u", ".join(args))

    def __eq__(self, other):
        """Tokens are equal if *other* is of this type with equal kwargs."""
        if isinstance(other, type(self)):
            return self._kwargs == other._kwargs
        return False

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly.
        return not self.__eq__(other)

    def __getattr__(self, key):
        """Look *key* up in the stored kwargs.

        Raises AttributeError when it is missing, so that getattr() with a
        default, hasattr() and the copy module behave normally.
        """
        try:
            # Go through __dict__ so a missing _kwargs cannot recurse back
            # into __getattr__.
            return self.__dict__["_kwargs"][key]
        except KeyError:
            raise AttributeError(key)

    def __setattr__(self, key, value):
        self._kwargs[key] = value

    def __delattr__(self, key):
        try:
            del self._kwargs[key]
        except KeyError:
            raise AttributeError(key)


def make(name):
    """Create and return a new Token subclass called *name*.

    The name is also added to ``__all__`` so the new type is exported.
    """
    token_type = type(name, (Token,), {})
    __all__.append(name)
    return token_type

# Concrete token types. Each one is an empty Token subclass created by make();
# where a trailing comment is given, it shows the wikitext the token stands for.

# Plain text between other tokens.
Text = make("Text")

# Templates.
TemplateOpen = make("TemplateOpen") # {{
TemplateParamSeparator = make("TemplateParamSeparator") # |
TemplateParamEquals = make("TemplateParamEquals") # =
TemplateClose = make("TemplateClose") # }}

# HTML entities.
HTMLEntityStart = make("HTMLEntityStart") # &
HTMLEntityNumeric = make("HTMLEntityNumeric") # #
HTMLEntityHex = make("HTMLEntityHex") # x
HTMLEntityEnd = make("HTMLEntityEnd") # ;

# Headings.
HeadingStart = make("HeadingStart") # =...
HeadingEnd = make("HeadingEnd") # =...

# Tags.
TagOpenOpen = make("TagOpenOpen") # <
TagAttrStart = make("TagAttrStart") # (marks the beginning of an attribute)
TagAttrEquals = make("TagAttrEquals") # =
TagAttrQuote = make("TagAttrQuote") # "
TagCloseOpen = make("TagCloseOpen") # >
TagCloseSelfclose = make("TagCloseSelfclose") # />
TagOpenClose = make("TagOpenClose") # </
TagCloseClose = make("TagCloseClose") # >

# make() is only a helper for the definitions above; drop it from the module.
del make

+ 2
- 0
mwparserfromhell/smart_list.py View File

@@ -81,6 +81,7 @@ class SmartList(list):

def __iadd__(self, other):
self.extend(other)
return self

def append(self, item):
head = len(self)
@@ -221,6 +222,7 @@ class _ListProxy(list):

def __iadd__(self, other):
self.extend(other)
return self

@property
def _start(self):


+ 5
- 4
mwparserfromhell/utils.py View File

@@ -22,24 +22,25 @@

import mwparserfromhell
from .nodes import Node
from .smart_list import SmartList

def parse_anything(value):
wikicode = mwparserfromhell.wikicode.Wikicode
if isinstance(value, wikicode):
return value
if isinstance(value, Node):
return wikicode([value])
return wikicode(SmartList([value]))
if isinstance(value, basestring):
return mwparserfromhell.parse(value)
if isinstance(value, int):
return mwparserfromhell.parse(unicode(value))
if value is None:
return wikicode([])
return wikicode(SmartList())
try:
nodelist = []
nodelist = SmartList()
for item in value:
nodelist += parse_anything(item).nodes
except TypeError:
error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}"
raise ValueError(error.format(type(value), value))
raise ValueError(error.format(type(value).__name__, value))
return wikicode(nodelist)

+ 4
- 0
mwparserfromhell/wikicode.py View File

@@ -105,6 +105,10 @@ class Wikicode(StringMixIn):
def nodes(self):
return self._nodes

@nodes.setter
def nodes(self, value):
self._nodes = value

def get(self, index):
return self.nodes[index]



Loading…
Cancel
Save