diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index 94451ab..99bdc58 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -22,7 +22,7 @@ from ..string_mixin import StringMixIn -__all__ = ["Node"] +__all__ = ["Node", "Text", "Heading", "HTMLEntity", "Tag", "Template"] class Node(StringMixIn): def __unicode__(self): diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index cd128a9..d7368b0 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -26,7 +26,7 @@ __all__ = ["Attribute"] class Attribute(StringMixIn): def __init__(self, name, value=None, quoted=True): - super(Attribute, self).__init__(self) + super(Attribute, self).__init__() self._name = name self._value = value self._quoted = quoted diff --git a/mwparserfromhell/nodes/extras/parameter.py b/mwparserfromhell/nodes/extras/parameter.py index 3f15ec6..ac5dde0 100644 --- a/mwparserfromhell/nodes/extras/parameter.py +++ b/mwparserfromhell/nodes/extras/parameter.py @@ -27,7 +27,7 @@ __all__ = ["Parameter"] class Parameter(StringMixIn): def __init__(self, name, value, showkey=True): - super(Parameter, self).__init__(self) + super(Parameter, self).__init__() self._name = name self._value = value self._showkey = showkey diff --git a/mwparserfromhell/nodes/heading.py b/mwparserfromhell/nodes/heading.py index c2bd5a7..7ac8065 100644 --- a/mwparserfromhell/nodes/heading.py +++ b/mwparserfromhell/nodes/heading.py @@ -26,12 +26,12 @@ __all__ = ["Heading"] class Heading(Node): def __init__(self, title, level): - super(Heading, self).__init__(self) + super(Heading, self).__init__() self._title = title self._level = level def __unicode__(self): - return ("=" * self.level) + self.title + ("=" * self.level) + return ("=" * self.level) + unicode(self.title) + ("=" * self.level) def __iternodes__(self, getter): yield None, self diff --git 
a/mwparserfromhell/nodes/html_entity.py b/mwparserfromhell/nodes/html_entity.py index ee68691..ea18ecf 100644 --- a/mwparserfromhell/nodes/html_entity.py +++ b/mwparserfromhell/nodes/html_entity.py @@ -26,9 +26,9 @@ from . import Node __all__ = ["HTMLEntity"] -class HTMLEntity(Node): - def __init__(self, value, named=None, hexadecimal=False): - super(HTMLEntity, self).__init__(self) +class HTMLEntity(Node): + def __init__(self, value, named=None, hexadecimal=False, hex_char="x"): + super(HTMLEntity, self).__init__() self._value = value if named is None: # Try to guess whether or not the entity is named try: @@ -46,12 +46,13 @@ class HTMLEntity(Node): else: self._named = named self._hexadecimal = hexadecimal + self._hex_char = hex_char def __unicode__(self): if self.named: return u"&{0};".format(self.value) if self.hexadecimal: - return u"&#x{0};".format(self.value) + return u"&#{0}{1};".format(self.hex_char, self.value) return u"&#{0};".format(self.value) def __strip__(self, normalize, collapse): @@ -94,6 +95,10 @@ class HTMLEntity(Node): def hexadecimal(self): return self._hexadecimal + @property + def hex_char(self): + return self._hex_char + def normalize(self): if self.named: return unichr(htmlentitydefs.name2codepoint[self.value]) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 5d911a8..24654b9 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -67,9 +67,9 @@ class Tag(Node): TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE)) TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE - def __init__(self, type_, tag, contents, attrs=None, showtag=True, + def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, self_closing=False, open_padding=0, close_padding=0): - super(Tag, self).__init__(self) + super(Tag, self).__init__() self._type = type_ self._tag = tag self._contents = contents diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 
99315d7..581e8ce 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -33,7 +33,7 @@ FLAGS = re.DOTALL | re.UNICODE class Template(Node): def __init__(self, name, params=None): - super(Template, self).__init__(self) + super(Template, self).__init__() self._name = name if params: self._params = params @@ -77,7 +77,7 @@ class Template(Node): code.replace(node, node.replace(char, replacement)) def _blank_param_value(self, value): - match = re.search("^(\s*).*?(\s*)$", unicode(value), FLAGS) + match = re.search(r"^(\s*).*?(\s*)$", unicode(value), FLAGS) value.nodes = [Text(match.group(1)), Text(match.group(2))] def _select_theory(self, theories): @@ -91,7 +91,7 @@ class Template(Node): before_theories = defaultdict(lambda: 0) after_theories = defaultdict(lambda: 0) for param in self.params: - match = re.search("^(\s*).*?(\s*)$", unicode(param.value), FLAGS) + match = re.search(r"^(\s*).*?(\s*)$", unicode(param.value), FLAGS) before, after = match.group(1), match.group(2) before_theories[before] += 1 after_theories[after] += 1 @@ -100,6 +100,21 @@ class Template(Node): after = self._select_theory(after_theories) return before, after + def _remove_with_field(self, param, i, name): + if param.showkey: + following = self.params[i+1:] + better_matches = [after.name.strip() == name and not after.showkey for after in following] + if any(better_matches): + return False + return True + + def _remove_without_field(self, param, i, force_no_field): + if not param.showkey and not force_no_field: + dependents = [not after.showkey for after in self.params[i+1:]] + if any(dependents): + return False + return True + @property def name(self): return self._name @@ -119,7 +134,7 @@ class Template(Node): def get(self, name): name = name.strip() if isinstance(name, basestring) else unicode(name) - for param in self.params: + for param in reversed(self.params): if param.name.strip() == name: return param raise ValueError(name) @@ -131,10 +146,10 @@ 
class Template(Node): if self.has_param(name): self.remove(name, keep_field=True) existing = self.get(name) - if showkey is None: # Infer showkey from current value - showkey = existing.showkey - if not showkey: - self._surface_escape(value, "=") + if showkey is not None: + if not showkey: + self._surface_escape(value, "=") + existing.showkey = showkey nodes = existing.value.nodes if force_nonconformity: existing.value = value @@ -144,10 +159,20 @@ class Template(Node): if showkey is None: try: - int(name) - showkey = True + int_name = int(unicode(name)) except ValueError: - showkey = False + showkey = True + else: + int_keys = set() + for param in self.params: + if not param.showkey: + if re.match(r"[1-9][0-9]*$", param.name.strip()): + int_keys.add(int(unicode(param.name))) + expected = min(set(range(1, len(int_keys) + 2)) - int_keys) + if expected == int_name: + showkey = False + else: + showkey = True if not showkey: self._surface_escape(value, "=") if not force_nonconformity: @@ -164,12 +189,21 @@ class Template(Node): def remove(self, name, keep_field=False, force_no_field=False): name = name.strip() if isinstance(name, basestring) else unicode(name) + removed = False for i, param in enumerate(self.params): if param.name.strip() == name: if keep_field: - return self._blank_param_value(param.value) - dependent = [not after.showkey for after in self.params[i+1:]] - if any(dependent) and not param.showkey and not force_no_field: - return self._blank_param_value(param.value) - return self.params.remove(param) - raise ValueError(name) + if self._remove_with_field(param, i, name): + self._blank_param_value(param.value) + keep_field = False + else: + self.params.remove(param) + else: + if self._remove_without_field(param, i, force_no_field): + self.params.remove(param) + else: + self._blank_param_value(param.value) + if not removed: + removed = True + if not removed: + raise ValueError(name) diff --git a/mwparserfromhell/nodes/text.py 
b/mwparserfromhell/nodes/text.py index 82cad96..4b4d4ac 100644 --- a/mwparserfromhell/nodes/text.py +++ b/mwparserfromhell/nodes/text.py @@ -26,7 +26,7 @@ __all__ = ["Text"] class Text(Node): def __init__(self, value): - super(Text, self).__init__(self) + super(Text, self).__init__() self._value = value def __unicode__(self): diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index c32a549..49ea940 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -20,4 +20,22 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from .demo import DemoParser as Parser +try: + from ._builder import CBuilder as Builder + from ._tokenizer import CTokenizer as Tokenizer +except ImportError: + from .builder import Builder + from .tokenizer import Tokenizer + +__all__ = ["Parser"] + +class Parser(object): + def __init__(self, text): + self.text = text + self._tokenizer = Tokenizer() + self._builder = Builder() + + def parse(self): + tokens = self._tokenizer.tokenize(self.text) + code = self._builder.build(tokens) + return code diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py new file mode 100644 index 0000000..d352321 --- /dev/null +++ b/mwparserfromhell/parser/builder.py @@ -0,0 +1,177 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from . import tokens +from ..nodes import Heading, HTMLEntity, Tag, Template, Text +from ..nodes.extras import Attribute, Parameter +from ..smart_list import SmartList +from ..wikicode import Wikicode + +__all__ = ["Builder"] + +class Builder(object): + def __init__(self): + self._tokens = [] + self._stacks = [] + + def _wrap(self, nodes): + return Wikicode(SmartList(nodes)) + + def _push(self): + self._stacks.append([]) + + def _pop(self, wrap=True): + if wrap: + return self._wrap(self._stacks.pop()) + return self._stacks.pop() + + def _write(self, item): + self._stacks[-1].append(item) + + def _handle_parameter(self, default): + key = None + showkey = False + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.TemplateParamEquals): + key = self._pop() + showkey = True + self._push() + elif isinstance(token, (tokens.TemplateParamSeparator, + tokens.TemplateClose)): + self._tokens.append(token) + value = self._pop() + if not key: + key = self._wrap([Text(unicode(default))]) + return Parameter(key, value, showkey) + else: + self._write(self._handle_token(token)) + + def _handle_template(self): + params = [] + default = 1 + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.TemplateParamSeparator): + if not params: + name = self._pop() + param = self._handle_parameter(default) + params.append(param) + if not param.showkey: + default += 1 + elif isinstance(token, tokens.TemplateClose): + if not params: + 
name = self._pop() + return Template(name, params) + else: + self._write(self._handle_token(token)) + + def _handle_entity(self): + token = self._tokens.pop() + if isinstance(token, tokens.HTMLEntityNumeric): + token = self._tokens.pop() + if isinstance(token, tokens.HTMLEntityHex): + text = self._tokens.pop() + self._tokens.pop() # Remove HTMLEntityEnd + return HTMLEntity(text.text, named=False, hexadecimal=True, + hex_char=token.char) + self._tokens.pop() # Remove HTMLEntityEnd + return HTMLEntity(token.text, named=False, hexadecimal=False) + self._tokens.pop() # Remove HTMLEntityEnd + return HTMLEntity(token.text, named=True, hexadecimal=False) + + def _handle_heading(self, token): + level = token.level + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.HeadingEnd): + title = self._pop() + return Heading(title, level) + else: + self._write(self._handle_token(token)) + + def _handle_attribute(self): + name, quoted = None, False + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.TagAttrEquals): + name = self._pop() + self._push() + elif isinstance(token, tokens.TagAttrQuote): + quoted = True + elif isinstance(token, (tokens.TagAttrStart, + tokens.TagCloseOpen)): + self._tokens.append(token) + if name is not None: + return Attribute(name, self._pop(), quoted) + return Attribute(self._pop(), quoted=quoted) + else: + self._write(self._handle_token(token)) + + def _handle_tag(self, token): + type_, showtag = token.type, token.showtag + attrs = [] + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.TagAttrStart): + attrs.append(self._handle_attribute()) + elif isinstance(token, tokens.TagCloseOpen): + open_pad = token.padding + tag = self._pop() + self._push() + elif isinstance(token, tokens.TagCloseSelfclose): + tag = self._pop() + return Tag(type_, tag, attrs=attrs, showtag=showtag, + self_closing=True, open_padding=token.padding) + 
elif isinstance(token, tokens.TagOpenClose): + contents = self._pop() + elif isinstance(token, tokens.TagCloseClose): + return Tag(type_, tag, contents, attrs, showtag, False, + open_pad, token.padding) + else: + self._write(self._handle_token(token)) + + def _handle_token(self, token): + if isinstance(token, tokens.Text): + return Text(token.text) + elif isinstance(token, tokens.TemplateOpen): + return self._handle_template() + elif isinstance(token, tokens.HTMLEntityStart): + return self._handle_entity() + elif isinstance(token, tokens.HeadingStart): + return self._handle_heading(token) + elif isinstance(token, tokens.TagOpenOpen): + return self._handle_tag(token) + + def build(self, tokenlist): + self._tokens = tokenlist + self._tokens.reverse() + self._push() + while self._tokens: + node = self._handle_token(self._tokens.pop()) + self._write(node) + return self._pop() diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py new file mode 100644 index 0000000..6369ee2 --- /dev/null +++ b/mwparserfromhell/parser/contexts.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Local (stack-specific) contexts: + +TEMPLATE = 0b000000111 +TEMPLATE_NAME = 0b000000001 +TEMPLATE_PARAM_KEY = 0b000000010 +TEMPLATE_PARAM_VALUE = 0b000000100 + +HEADING = 0b111111000 +HEADING_LEVEL_1 = 0b000001000 +HEADING_LEVEL_2 = 0b000010000 +HEADING_LEVEL_3 = 0b000100000 +HEADING_LEVEL_4 = 0b001000000 +HEADING_LEVEL_5 = 0b010000000 +HEADING_LEVEL_6 = 0b100000000 + + +# Global contexts: + +GL_HEADING = 0b1 diff --git a/mwparserfromhell/parser/demo.py b/mwparserfromhell/parser/demo.py deleted file mode 100644 index f8ce479..0000000 --- a/mwparserfromhell/parser/demo.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2012 Ben Kurtovic -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -from ..nodes import Template, Text -from ..nodes.extras import Parameter -from ..smart_list import SmartList -from ..wikicode import Wikicode - -__all__ = ["DemoParser"] - -class DemoParser(object): - def __init__(self, text): - self.text = text - - def _tokenize(self): - return [] - - def parse(self): - # Ensure text is unicode! - text = u"This is a {{test}} message with a {{template|with|foo={{params}}}}." - - node1 = Text(u"This is a ") - node2 = Template(Wikicode([Text(u"test")])) - node3 = Text(u" message with a ") - node4_param1_name = Wikicode([Text(u"1")]) - node4_param1_value = Wikicode([Text(u"with")]) - node4_param1 = Parameter(node4_param1_name, node4_param1_value, showkey=False) - node4_param2_name = Wikicode([Text(u"foo")]) - node4_param2_value = Wikicode([Template(Wikicode([Text(u"params")]))]) - node4_param2 = Parameter(node4_param2_name, node4_param2_value, showkey=True) - node4 = Template(Wikicode([Text(u"template")]), [node4_param1, node4_param2]) - node5 = Text(u".") - parsed = Wikicode(SmartList([node1, node2, node3, node4, node5])) - return parsed diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py new file mode 100644 index 0000000..159ba67 --- /dev/null +++ b/mwparserfromhell/parser/tokenizer.py @@ -0,0 +1,285 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit 
persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import htmlentitydefs +from math import log +import re +import string + +from . import contexts +from . import tokens + +__all__ = ["Tokenizer"] + +class BadRoute(Exception): + pass + + +class Tokenizer(object): + START = object() + END = object() + MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", + "/", "-", "\n", END] + regex = re.compile(r"([{}\[\]<>|=&#*;:/\-\n])", flags=re.IGNORECASE) + + def __init__(self): + self._text = None + self._head = 0 + self._stacks = [] + self._global = 0 + + @property + def _stack(self): + return self._stacks[-1][0] + + @property + def _context(self): + return self._stacks[-1][1] + + @_context.setter + def _context(self, value): + self._stacks[-1][1] = value + + @property + def _textbuffer(self): + return self._stacks[-1][2] + + @_textbuffer.setter + def _textbuffer(self, value): + self._stacks[-1][2] = value + + def _push(self, context=0): + self._stacks.append([[], context, []]) + + def _push_textbuffer(self): + if self._textbuffer: + self._stack.append(tokens.Text(text="".join(self._textbuffer))) + self._textbuffer = [] + + def _pop(self): + self._push_textbuffer() + return self._stacks.pop()[0] + + def _fail_route(self): + self._pop() + raise BadRoute() + + def _write(self, token): + 
self._push_textbuffer() + self._stack.append(token) + + def _write_text(self, text): + self._textbuffer.append(text) + + def _write_all(self, tokenlist): + if tokenlist and isinstance(tokenlist[0], tokens.Text): + self._write_text(tokenlist.pop(0).text) + self._push_textbuffer() + self._stack.extend(tokenlist) + + def _read(self, delta=0, wrap=False, strict=False): + index = self._head + delta + if index < 0 and (not wrap or abs(index) > len(self._text)): + return self.START + try: + return self._text[index] + except IndexError: + if strict: + self._fail_route() + return self.END + + def _parse_template(self): + reset = self._head + self._head += 2 + try: + template = self._parse(contexts.TEMPLATE_NAME) + except BadRoute: + self._head = reset + self._write_text(self._read()) + else: + self._write(tokens.TemplateOpen()) + self._write_all(template) + self._write(tokens.TemplateClose()) + + def _verify_template_name(self): + self._push_textbuffer() + if self._stack: + text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] + text = "".join([token.text for token in text]) + if text.strip() and "\n" in text.strip(): + self._fail_route() + + def _handle_template_param(self): + if self._context & contexts.TEMPLATE_NAME: + self._verify_template_name() + self._context ^= contexts.TEMPLATE_NAME + if self._context & contexts.TEMPLATE_PARAM_VALUE: + self._context ^= contexts.TEMPLATE_PARAM_VALUE + self._context |= contexts.TEMPLATE_PARAM_KEY + self._write(tokens.TemplateParamSeparator()) + + def _handle_template_param_value(self): + self._context ^= contexts.TEMPLATE_PARAM_KEY + self._context |= contexts.TEMPLATE_PARAM_VALUE + self._write(tokens.TemplateParamEquals()) + + def _handle_template_end(self): + if self._context & contexts.TEMPLATE_NAME: + self._verify_template_name() + self._head += 1 + return self._pop() + + def _parse_heading(self): + self._global |= contexts.GL_HEADING + reset = self._head + self._head += 1 + best = 1 + while self._read() == "=": + 
best += 1 + self._head += 1 + context = contexts.HEADING_LEVEL_1 << min(best - 1, 5) + + try: + title, level = self._parse(context) + except BadRoute: + self._head = reset + best - 1 + self._write_text("=" * best) + else: + self._write(tokens.HeadingStart(level=level)) + if level < best: + self._write_text("=" * (best - level)) + self._write_all(title) + self._write(tokens.HeadingEnd()) + finally: + self._global ^= contexts.GL_HEADING + + def _handle_heading_end(self): + reset = self._head + self._head += 1 + best = 1 + while self._read() == "=": + best += 1 + self._head += 1 + current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1 + level = min(current, min(best, 6)) + + try: + after, after_level = self._parse(self._context) + except BadRoute: + if level < best: + self._write_text("=" * (best - level)) + self._head = reset + best - 1 + return self._pop(), level + else: + self._write_text("=" * best) + self._write_all(after) + return self._pop(), after_level + + def _really_parse_entity(self): + self._write(tokens.HTMLEntityStart()) + self._head += 1 + + this = self._read(strict=True) + if this == "#": + numeric = True + self._write(tokens.HTMLEntityNumeric()) + self._head += 1 + this = self._read(strict=True) + if this[0].lower() == "x": + hexadecimal = True + self._write(tokens.HTMLEntityHex(char=this[0])) + this = this[1:] + if not this: + self._fail_route() + else: + hexadecimal = False + else: + numeric = hexadecimal = False + + valid = string.hexdigits if hexadecimal else string.digits + if not numeric and not hexadecimal: + valid += string.ascii_letters + if not all([char in valid for char in this]): + self._fail_route() + + self._head += 1 + if self._read() != ";": + self._fail_route() + if numeric: + test = int(this, 16) if hexadecimal else int(this) + if test < 1 or test > 0x10FFFF: + self._fail_route() + else: + if this not in htmlentitydefs.entitydefs: + self._fail_route() + + self._write(tokens.Text(text=this)) + 
self._write(tokens.HTMLEntityEnd()) + + def _parse_entity(self): + reset = self._head + self._push() + try: + self._really_parse_entity() + except BadRoute: + self._head = reset + self._write_text(self._read()) + else: + self._write_all(self._pop()) + + def _parse(self, context=0): + self._push(context) + while True: + this = self._read() + if this not in self.MARKERS: + self._write_text(this) + self._head += 1 + continue + if this is self.END: + if self._context & (contexts.TEMPLATE | contexts.HEADING): + self._fail_route() + return self._pop() + prev, next = self._read(-1), self._read(1) + if this == next == "{": + self._parse_template() + elif this == "|" and self._context & contexts.TEMPLATE: + self._handle_template_param() + elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY: + self._handle_template_param_value() + elif this == next == "}" and self._context & contexts.TEMPLATE: + return self._handle_template_end() + elif (prev == "\n" or prev == self.START) and this == "=" and not self._global & contexts.GL_HEADING: + self._parse_heading() + elif this == "=" and self._context & contexts.HEADING: + return self._handle_heading_end() + elif this == "\n" and self._context & contexts.HEADING: + self._fail_route() + elif this == "&": + self._parse_entity() + else: + self._write_text(this) + self._head += 1 + + def tokenize(self, text): + split = self.regex.split(text) + self._text = [segment for segment in split if segment] + return self._parse() diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py new file mode 100644 index 0000000..3cb73c9 --- /dev/null +++ b/mwparserfromhell/parser/tokens.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +__all__ = ["Token"] + +class Token(object): + def __init__(self, **kwargs): + super(Token, self).__setattr__("_kwargs", kwargs) + + def __repr__(self): + args = [] + for key, value in self._kwargs.iteritems(): + if isinstance(value, basestring) and len(value) > 100: + args.append(key + "=" + repr(value[:97] + "...")) + else: + args.append(key + "=" + repr(value)) + return u"{0}({1})".format(type(self).__name__, u", ".join(args)) + + def __eq__(self, other): + if isinstance(other, type(self)): + return self._kwargs == other._kwargs + return False + + def __getattr__(self, key): + return self._kwargs[key] + + def __setattr__(self, key, value): + self._kwargs[key] = value + + def __delattr__(self, key): + del self._kwargs[key] + + +def make(name): + __all__.append(name) + return type(name, (Token,), {}) + +Text = make("Text") + +TemplateOpen = make("TemplateOpen") # {{ +TemplateParamSeparator = make("TemplateParamSeparator") # | +TemplateParamEquals = make("TemplateParamEquals") # = +TemplateClose = make("TemplateClose") # }} + +HTMLEntityStart = make("HTMLEntityStart") # & +HTMLEntityNumeric = make("HTMLEntityNumeric") # # +HTMLEntityHex = 
make("HTMLEntityHex") # x +HTMLEntityEnd = make("HTMLEntityEnd") # ; + +HeadingStart = make("HeadingStart") # =... +HeadingEnd = make("HeadingEnd") # =... + +TagOpenOpen = make("TagOpenOpen") # < +TagAttrStart = make("TagAttrStart") +TagAttrEquals = make("TagAttrEquals") # = +TagAttrQuote = make("TagAttrQuote") # " +TagCloseOpen = make("TagCloseOpen") # > +TagCloseSelfclose = make("TagCloseSelfclose") # /> +TagOpenClose = make("TagOpenClose") # </ +TagCloseClose = make("TagCloseClose") # > + +del make diff --git a/mwparserfromhell/smart_list.py b/mwparserfromhell/smart_list.py index 244c9f7..c6b484f 100644 --- a/mwparserfromhell/smart_list.py +++ b/mwparserfromhell/smart_list.py @@ -81,6 +81,7 @@ class SmartList(list): def __iadd__(self, other): self.extend(other) + return self def append(self, item): head = len(self) @@ -221,6 +222,7 @@ class _ListProxy(list): def __iadd__(self, other): self.extend(other) + return self @property def _start(self): diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index 33084b5..9c32c10 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -22,24 +22,25 @@ import mwparserfromhell from .nodes import Node +from .smart_list import SmartList def parse_anything(value): wikicode = mwparserfromhell.wikicode.Wikicode if isinstance(value, wikicode): return value if isinstance(value, Node): - return wikicode([value]) + return wikicode(SmartList([value])) if isinstance(value, basestring): return mwparserfromhell.parse(value) if isinstance(value, int): return mwparserfromhell.parse(unicode(value)) if value is None: - return wikicode([]) + return wikicode(SmartList()) try: - nodelist = [] + nodelist = SmartList() for item in value: nodelist += parse_anything(item).nodes except TypeError: error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" - raise ValueError(error.format(type(value), value)) + raise ValueError(error.format(type(value).__name__, value)) return wikicode(nodelist) diff --git 
a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index cd38dcf..9631260 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -105,6 +105,10 @@ class Wikicode(StringMixIn): def nodes(self): return self._nodes + @nodes.setter + def nodes(self, value): + self._nodes = value + def get(self, index): return self.nodes[index]