From 33acb3eea3d9cedf9ca7def41499487860b2a94b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 6 Aug 2012 00:44:06 -0400 Subject: [PATCH 01/26] Starting main parser: Parser, Tokens, Tokenizer, Builder, BuildStack. --- mwparserfromhell/nodes/__init__.py | 2 +- mwparserfromhell/parser/__init__.py | 16 +++++- mwparserfromhell/parser/build_stack.py | 36 +++++++++++++ mwparserfromhell/parser/builder.py | 93 ++++++++++++++++++++++++++++++++++ mwparserfromhell/parser/demo.py | 53 ------------------- mwparserfromhell/parser/tokenizer.py | 30 +++++++++++ mwparserfromhell/parser/tokens.py | 65 ++++++++++++++++++++++++ 7 files changed, 240 insertions(+), 55 deletions(-) create mode 100644 mwparserfromhell/parser/build_stack.py create mode 100644 mwparserfromhell/parser/builder.py delete mode 100644 mwparserfromhell/parser/demo.py create mode 100644 mwparserfromhell/parser/tokenizer.py create mode 100644 mwparserfromhell/parser/tokens.py diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index f749e71..0777479 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -22,7 +22,7 @@ from ..string_mixin import StringMixIn -__all__ = ["Node"] +__all__ = ["Node", "Text", "Heading", "HTMLEntity", "Tag", "Template"] class Node(StringMixIn): def __iternodes__(self, getter): diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index c32a549..f70273f 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -20,4 +20,18 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from .demo import DemoParser as Parser +from .builder import Builder +from .tokenizer import Tokenizer + +__all__ = ["Parser"] + +class Parser(object): + def __init__(self, text): + self.text = text + self._tokenizer = Tokenizer() + self._builder = Builder() + + def parse(self): + tokens = self._tokenizer.tokenize(self.text) + code = self._builder.build(tokens) + return code diff --git a/mwparserfromhell/parser/build_stack.py b/mwparserfromhell/parser/build_stack.py new file mode 100644 index 0000000..66cc67b --- /dev/null +++ b/mwparserfromhell/parser/build_stack.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +__all__ = ["BuildStack"] + +class BuildStack(object): + def __init__(self): + pass + + def write(self, item): + pass + + def push(self): + pass + + def pop(self): + pass diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py new file mode 100644 index 0000000..cedb83f --- /dev/null +++ b/mwparserfromhell/parser/builder.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import re + +from . import tokens +from .build_stack import BuildStack +from ..nodes import Template, Text +from ..nodes.extras import Parameter +from ..smart_list import SmartList +from ..wikicode import Wikicode + +__all__ = ["Builder"] + +class Builder(object): + def __init__(self): + self._tokens = [] + self._stack = BuildStack() + + def _pop(self): + return Wikicode(SmartList(stack.pop())) + + def _handle_parameter(self, key): + showkey = False + self._stack.push() + while self._tokens: + token = self._tokens.pop(0) + if isinstance(token, tokens.TEMPLATE_PARAM_EQUALS): + key = self._pop() + showkey = True + self._stack.push() + elif isinstance(token, (tokens.TEMPLATE_PARAM_SEPARATOR, + tokens.TEMPLATE_CLOSE)): + self._tokens.insert(0, token) + value = self._pop() + return Parameter(key, value, showkey) + else: + self._stack.write(self._handle_token()) + + def _handle_template(self): + params = [] + int_keys = set() + int_key_range = {1} + self._stack.push() + while self._tokens: + token = self._tokens.pop(0) + if isinstance(token, tokens.TEMPLATE_PARAM_SEPARATOR): + if not params: + name = self._pop() + param = self._handle_parameter(min(int_key_range - int_keys)) + if re.match(r"[1-9][0-9]*$", param.key.strip()): + int_keys.add(int(param.key)) + int_key_range.add(len(int_keys) + 1) + params.append(param) + elif isinstance(token, tokens.TEMPLATE_CLOSE): + if not params: + name = self._pop() + return Template(name, params) + else: + self._stack.write(self._handle_token()) + + def _handle_token(self): + token = self._tokens.pop(0) + if isinstance(token, tokens.TEXT): + return Text(token.text) + elif isinstance(token, tokens.TEMPLATE_OPEN): + return self._handle_template() + + def build(self, tokens): + self._tokens = tokens + self._stack.push() + while self._tokens: + self._stack.write(self._handle_token()) + return self._pop() diff --git a/mwparserfromhell/parser/demo.py b/mwparserfromhell/parser/demo.py deleted file mode 100644 index f8ce479..0000000 --- a/mwparserfromhell/parser/demo.py +++ 
/dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2012 Ben Kurtovic -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -from ..nodes import Template, Text -from ..nodes.extras import Parameter -from ..smart_list import SmartList -from ..wikicode import Wikicode - -__all__ = ["DemoParser"] - -class DemoParser(object): - def __init__(self, text): - self.text = text - - def _tokenize(self): - return [] - - def parse(self): - # Ensure text is unicode! - text = u"This is a {{test}} message with a {{template|with|foo={{params}}}}." - - node1 = Text(u"This is a ") - node2 = Template(Wikicode([Text(u"test")])) - node3 = Text(u" message with a ") - node4_param1_name = Wikicode([Text(u"1")]) - node4_param1_value = Wikicode([Text(u"with")]) - node4_param1 = Parameter(node4_param1_name, node4_param1_value, showkey=False) - node4_param2_name = Wikicode([Text(u"foo")]) - node4_param2_value = Wikicode([Template(Wikicode([Text(u"params")]))]) - node4_param2 = Parameter(node4_param2_name, node4_param2_value, showkey=True) - node4 = Template(Wikicode([Text(u"template")]), [node4_param1, node4_param2]) - node5 = Text(u".") - parsed = Wikicode(SmartList([node1, node2, node3, node4, node5])) - return parsed diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py new file mode 100644 index 0000000..0417489 --- /dev/null +++ b/mwparserfromhell/parser/tokenizer.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from . import tokens + +__all__ = ["Tokenizer"] + +class Tokenizer(object): + def tokenize(self, text): + tokens = [tokens.Text(text=text)] + return tokens diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py new file mode 100644 index 0000000..6228f7d --- /dev/null +++ b/mwparserfromhell/parser/tokens.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +__all__ = ["Token"] + +class Token(object): + def __init__(self, **kwargs): + self.__kwargs = kwargs + + def __getattr__(self, key): + return self.__kwargs[key] + + def __setattr__(self, key, value): + self.__kwargs[key] = value + + def __delattr__(self, key): + del self.__kwargs[key] + + +def make(name): + __all__.append(name) + return type(name, (Token,), {}) + +TEXT = make("TEXT") + +TEMPLATE_OPEN = make("TEMPLATE_OPEN") # {{ +TEMPLATE_PARAM_SEPARATOR = make("TEMPLATE_PARAM_SEPARATOR") # | +TEMPLATE_PARAM_EQUALS = make("TEMPLATE_PARAM_EQUALS") # = +TEMPLATE_CLOSE = make("TEMPLATE_CLOSE") # }} + +HTML_ENTITY_START = make("HTML_ENTITY_START") # & +HTML_ENTITY_NUMERIC = make("HTML_ENTITY_NUMERIC") # # +HTML_ENTITY_HEX = make("HTML_ENTITY_HEX") # x +HTML_ENTITY_END = make("HTML_ENTITY_END") # ; + +HEADING_BLOCK = make("HEADING_BLOCK") # =... + +TAG_OPEN_OPEN = make("TAG_OPEN_OPEN") # < +TAG_ATTR_EQUALS = make("TAG_ATTR_EQUALS") # = +TAG_ATTR_QUOTE = make("TAG_ATTR_QUOTE") # " +TAG_CLOSE_OPEN = make("TAG_CLOSE_OPEN") # > +TAG_CLOSE_SELFCLOSE = make("TAG_CLOSE_SELFCLOSE") # /> +TAG_OPEN_CLOSE = make("TAG_OPEN_CLOSE") # </ +TAG_CLOSE_CLOSE = make("TAG_CLOSE_CLOSE") # > + +del make From 219b9086d200d84675da61f27e583883e4f640de Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 6 Aug 2012 21:49:17 -0400 Subject: [PATCH 02/26] Finish implementing Builder for all existing Nodes; BuildStack.
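
A sketch of how the pieces are meant to fit together (illustrative
only, not part of the diff below): BuildStack keeps a stack of node
lists, and popping wraps the finished list in a Wikicode object:

    stack = BuildStack()
    stack.push()                # outer document
    stack.push()                # nested context, e.g. a template name
    stack.write(Text(u"foo"))
    name = stack.pop()          # -> Wikicode(SmartList([Text(u"foo")]))
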
--- mwparserfromhell/nodes/heading.py | 2 +- mwparserfromhell/nodes/tag.py | 2 +- mwparserfromhell/parser/build_stack.py | 11 +++-- mwparserfromhell/parser/builder.py | 90 +++++++++++++++++++++++++++++----- mwparserfromhell/parser/tokens.py | 1 + 5 files changed, 87 insertions(+), 19 deletions(-) diff --git a/mwparserfromhell/nodes/heading.py b/mwparserfromhell/nodes/heading.py index ee10a9f..1f0ca0f 100644 --- a/mwparserfromhell/nodes/heading.py +++ b/mwparserfromhell/nodes/heading.py @@ -30,7 +30,7 @@ class Heading(Node): self._level = level def __unicode__(self): - return ("=" * self.level) + self.title + ("=" * self.level) + return ("=" * self.level) + unicode(self.title) + ("=" * self.level) def __iternodes__(self, getter): yield None, self diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index d80536b..cd4fd8c 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -67,7 +67,7 @@ class Tag(Node): TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE)) TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE - def __init__(self, type_, tag, contents, attrs=None, showtag=True, + def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, self_closing=False, open_padding=0, close_padding=0): self._type = type_ self._tag = tag diff --git a/mwparserfromhell/parser/build_stack.py b/mwparserfromhell/parser/build_stack.py index 66cc67b..23061d0 100644 --- a/mwparserfromhell/parser/build_stack.py +++ b/mwparserfromhell/parser/build_stack.py @@ -20,17 +20,20 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from ..smart_list import SmartList +from ..wikicode import Wikicode + __all__ = ["BuildStack"] class BuildStack(object): def __init__(self): - pass + self._stacks = [] def write(self, item): - pass + self._stacks[-1].append(item) def push(self): - pass + self._stacks.append([]) def pop(self): - pass + return Wikicode(SmartList(self._stacks.pop())) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index cedb83f..c53609d 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -24,10 +24,8 @@ import re from . 
import tokens from .build_stack import BuildStack -from ..nodes import Template, Text -from ..nodes.extras import Parameter -from ..smart_list import SmartList -from ..wikicode import Wikicode +from ..nodes import Heading, HTMLEntity, Tag, Template, Text +from ..nodes.extras import Attribute, Parameter __all__ = ["Builder"] @@ -36,22 +34,19 @@ class Builder(object): self._tokens = [] self._stack = BuildStack() - def _pop(self): - return Wikicode(SmartList(stack.pop())) - def _handle_parameter(self, key): showkey = False self._stack.push() while self._tokens: token = self._tokens.pop(0) if isinstance(token, tokens.TEMPLATE_PARAM_EQUALS): - key = self._pop() + key = self._stack.pop() showkey = True self._stack.push() elif isinstance(token, (tokens.TEMPLATE_PARAM_SEPARATOR, - tokens.TEMPLATE_CLOSE)): + tokens.TEMPLATE_CLOSE)): self._tokens.insert(0, token) - value = self._pop() + value = self._stack.pop() return Parameter(key, value, showkey) else: self._stack.write(self._handle_token()) @@ -65,7 +60,7 @@ class Builder(object): token = self._tokens.pop(0) if isinstance(token, tokens.TEMPLATE_PARAM_SEPARATOR): if not params: - name = self._pop() + name = self._stack.pop() param = self._handle_parameter(min(int_key_range - int_keys)) if re.match(r"[1-9][0-9]*$", param.key.strip()): int_keys.add(int(param.key)) @@ -73,21 +68,90 @@ class Builder(object): params.append(param) elif isinstance(token, tokens.TEMPLATE_CLOSE): if not params: - name = self._pop() + name = self._stack.pop() return Template(name, params) else: self._stack.write(self._handle_token()) + def _handle_entity(self): + token = self._tokens.pop(0) + if isinstance(token, tokens.HTML_ENTITY_NUMERIC): + token = self._tokens.pop(0) + if isinstance(token, tokens.HTML_ENTITY_HEX): + token = self._tokens.pop(0) + return HTMLEntity(token.text, named=False, hexadecimal=True) + return HTMLEntity(token.text, named=False, hexadecimal=False) + return HTMLEntity(token.text, named=True, hexadecimal=False) + + def _handle_heading(self, token): + level = token.level + self._stack.push() + while self._tokens: + token = self._tokens.pop(0) + if isinstance(token, tokens.HEADING_BLOCK): + title = self._stack.pop() + return Heading(title, level) + else: + self._stack.write(self._handle_token()) + + def _handle_attribute(self): + name, quoted = None, False + self._stack.push() + while self._tokens: + token = self._tokens.pop(0) + if isinstance(token, tokens.TAG_ATTR_EQUALS): + name = self._stack.pop() + self._stack.push() + elif isinstance(token, tokens.TAG_ATTR_QUOTE): + quoted = True + elif isinstance(token, (tokens.TAG_ATTR_START, + tokens.TAG_CLOSE_OPEN)): + self._tokens.insert(0, token) + if name is not None: + return Attribute(name, self._stack.pop(), quoted) + return Attribute(self._stack.pop(), quoted=quoted) + else: + self._stack.write(self._handle_token()) + + def _handle_tag(self, token): + type_, showtag, attrs = token.type, token.showtag, attrs + self._stack.push() + while self._tokens: + token = self._tokens.pop(0) + if isinstance(token, tokens.TAG_ATTR_START): + attrs.append(self._handle_attribute()) + elif isinstance(token, tokens.TAG_CLOSE_OPEN): + open_pad = token.padding + tag = self._stack.pop() + self._stack.push() + elif isinstance(token, tokens.TAG_CLOSE_SELFCLOSE): + tag = self._stack.pop() + return Tag(type_, tag, attrs=attrs, showtag=showtag, + self_closing=True, open_padding=token.padding) + elif isinstance(token, tokens.TAG_OPEN_CLOSE): + contents = self._stack.pop() + elif isinstance(token, tokens.TAG_CLOSE_CLOSE): + return 
Tag(type_, tag, contents, attrs, showtag, self_closing, + open_pad, token.padding) + else: + self._stack.write(self._handle_token()) + def _handle_token(self): token = self._tokens.pop(0) if isinstance(token, tokens.TEXT): return Text(token.text) elif isinstance(token, tokens.TEMPLATE_OPEN): return self._handle_template() + elif isinstance(token, tokens.HTML_ENTITY_START): + return self._handle_entity() + elif isinstance(token, tokens.HEADING_BLOCK): + return self._handle_heading(token) + elif isinstance(token, tokens.TAG_OPEN_OPEN): + return self._handle_tag(token) def build(self, tokens): self._tokens = tokens self._stack.push() while self._tokens: self._stack.write(self._handle_token()) - return self._pop() + return self._stack.pop() diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 6228f7d..05dbcc9 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -55,6 +55,7 @@ HTML_ENTITY_END = make("HTML_ENTITY_END") # ; HEADING_BLOCK = make("HEADING_BLOCK") # =... TAG_OPEN_OPEN = make("TAG_OPEN_OPEN") # < +TAG_ATTR_START = make("TAG_ATTR_START") TAG_ATTR_EQUALS = make("TAG_ATTR_EQUALS") # = TAG_ATTR_QUOTE = make("TAG_ATTR_QUOTE") # " TAG_CLOSE_OPEN = make("TAG_CLOSE_OPEN") # > From 8f0782f9157f9baf9c6a70f5270e0f39710352f6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 6 Aug 2012 21:55:45 -0400 Subject: [PATCH 03/26] Fix some typos/bugs. --- mwparserfromhell/parser/builder.py | 13 +++++++------ mwparserfromhell/parser/tokenizer.py | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index c53609d..5e8aaf5 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -62,8 +62,8 @@ class Builder(object): if not params: name = self._stack.pop() param = self._handle_parameter(min(int_key_range - int_keys)) - if re.match(r"[1-9][0-9]*$", param.key.strip()): - int_keys.add(int(param.key)) + if re.match(r"[1-9][0-9]*$", param.name.strip()): + int_keys.add(int(param.name)) int_key_range.add(len(int_keys) + 1) params.append(param) elif isinstance(token, tokens.TEMPLATE_CLOSE): @@ -114,7 +114,8 @@ class Builder(object): self._stack.write(self._handle_token()) def _handle_tag(self, token): - type_, showtag, attrs = token.type, token.showtag, attrs + type_, showtag = token.type, token.showtag + attrs = [] self._stack.push() while self._tokens: token = self._tokens.pop(0) @@ -131,7 +132,7 @@ class Builder(object): elif isinstance(token, tokens.TAG_OPEN_CLOSE): contents = self._stack.pop() elif isinstance(token, tokens.TAG_CLOSE_CLOSE): - return Tag(type_, tag, contents, attrs, showtag, self_closing, + return Tag(type_, tag, contents, attrs, showtag, False, open_pad, token.padding) else: self._stack.write(self._handle_token()) @@ -149,8 +150,8 @@ class Builder(object): elif isinstance(token, tokens.TAG_OPEN_OPEN): return self._handle_tag(token) - def build(self, tokens): - self._tokens = tokens + def build(self, tokenlist): + self._tokens = tokenlist self._stack.push() while self._tokens: self._stack.write(self._handle_token()) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 0417489..dbdc48b 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -26,5 +26,5 @@ __all__ = ["Tokenizer"] class Tokenizer(object): def tokenize(self, text): - tokens = [tokens.Text(text=text)] - return tokens + tokenized = [tokens.TEXT(text=text)] 
+ return tokenized From 4539859c55aa6a1058900d4a02234085f18f3726 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 7 Aug 2012 01:56:16 -0400 Subject: [PATCH 04/26] Merge BuildStack into Builder. --- mwparserfromhell/parser/build_stack.py | 39 ----------------------- mwparserfromhell/parser/builder.py | 56 ++++++++++++++++++++-------------- mwparserfromhell/parser/tokenizer.py | 10 ++++-- 3 files changed, 41 insertions(+), 64 deletions(-) delete mode 100644 mwparserfromhell/parser/build_stack.py diff --git a/mwparserfromhell/parser/build_stack.py b/mwparserfromhell/parser/build_stack.py deleted file mode 100644 index 23061d0..0000000 --- a/mwparserfromhell/parser/build_stack.py +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2012 Ben Kurtovic -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -from ..smart_list import SmartList -from ..wikicode import Wikicode - -__all__ = ["BuildStack"] - -class BuildStack(object): - def __init__(self): - self._stacks = [] - - def write(self, item): - self._stacks[-1].append(item) - - def push(self): - self._stacks.append([]) - - def pop(self): - return Wikicode(SmartList(self._stacks.pop())) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 5e8aaf5..3b6a643 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -23,30 +23,40 @@ import re from . 
import tokens -from .build_stack import BuildStack from ..nodes import Heading, HTMLEntity, Tag, Template, Text from ..nodes.extras import Attribute, Parameter +from ..smart_list import SmartList +from ..wikicode import Wikicode __all__ = ["Builder"] class Builder(object): def __init__(self): self._tokens = [] - self._stack = BuildStack() + self._stacks = [] + + def _push(self): + self._stacks.append([]) + + def _pop(self): + return Wikicode(SmartList(self._stacks.pop())) + + def _write(self, item): + self._stacks[-1].append(item) def _handle_parameter(self, key): showkey = False - self._stack.push() + self._push() while self._tokens: token = self._tokens.pop(0) if isinstance(token, tokens.TEMPLATE_PARAM_EQUALS): - key = self._stack.pop() + key = self._pop() showkey = True - self._stack.push() + self._push() elif isinstance(token, (tokens.TEMPLATE_PARAM_SEPARATOR, tokens.TEMPLATE_CLOSE)): self._tokens.insert(0, token) - value = self._stack.pop() + value = self._pop() return Parameter(key, value, showkey) else: self._stack.write(self._handle_token()) @@ -55,12 +65,12 @@ class Builder(object): params = [] int_keys = set() int_key_range = {1} - self._stack.push() + self._push() while self._tokens: token = self._tokens.pop(0) if isinstance(token, tokens.TEMPLATE_PARAM_SEPARATOR): if not params: - name = self._stack.pop() + name = self._pop() param = self._handle_parameter(min(int_key_range - int_keys)) if re.match(r"[1-9][0-9]*$", param.name.strip()): int_keys.add(int(param.name)) @@ -68,7 +78,7 @@ class Builder(object): params.append(param) elif isinstance(token, tokens.TEMPLATE_CLOSE): if not params: - name = self._stack.pop() + name = self._pop() return Template(name, params) else: self._stack.write(self._handle_token()) @@ -85,52 +95,52 @@ class Builder(object): def _handle_heading(self, token): level = token.level - self._stack.push() + self._push() while self._tokens: token = self._tokens.pop(0) if isinstance(token, tokens.HEADING_BLOCK): - title = self._stack.pop() + title = self._pop() return Heading(title, level) else: self._stack.write(self._handle_token()) def _handle_attribute(self): name, quoted = None, False - self._stack.push() + self._push() while self._tokens: token = self._tokens.pop(0) if isinstance(token, tokens.TAG_ATTR_EQUALS): - name = self._stack.pop() - self._stack.push() + name = self._pop() + self._push() elif isinstance(token, tokens.TAG_ATTR_QUOTE): quoted = True elif isinstance(token, (tokens.TAG_ATTR_START, tokens.TAG_CLOSE_OPEN)): self._tokens.insert(0, token) if name is not None: - return Attribute(name, self._stack.pop(), quoted) - return Attribute(self._stack.pop(), quoted=quoted) + return Attribute(name, self._pop(), quoted) + return Attribute(self._pop(), quoted=quoted) else: self._stack.write(self._handle_token()) def _handle_tag(self, token): type_, showtag = token.type, token.showtag attrs = [] - self._stack.push() + self._push() while self._tokens: token = self._tokens.pop(0) if isinstance(token, tokens.TAG_ATTR_START): attrs.append(self._handle_attribute()) elif isinstance(token, tokens.TAG_CLOSE_OPEN): open_pad = token.padding - tag = self._stack.pop() - self._stack.push() + tag = self._pop() + self._push() elif isinstance(token, tokens.TAG_CLOSE_SELFCLOSE): - tag = self._stack.pop() + tag = self._pop() return Tag(type_, tag, attrs=attrs, showtag=showtag, self_closing=True, open_padding=token.padding) elif isinstance(token, tokens.TAG_OPEN_CLOSE): - contents = self._stack.pop() + contents = self._pop() elif isinstance(token, tokens.TAG_CLOSE_CLOSE): 
return Tag(type_, tag, contents, attrs, showtag, False, open_pad, token.padding) @@ -152,7 +162,7 @@ class Builder(object): def build(self, tokenlist): self._tokens = tokenlist - self._stack.push() + self._push() while self._tokens: self._stack.write(self._handle_token()) - return self._stack.pop() + return self._pop() diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index dbdc48b..10b4d8a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -25,6 +25,12 @@ from . import tokens __all__ = ["Tokenizer"] class Tokenizer(object): + def __init__(self): + self._text = None + self._head = 0 + self._tokens = [] + def tokenize(self, text): - tokenized = [tokens.TEXT(text=text)] - return tokenized + self._text = text + self._tokens.append(tokens.TEXT(text=text)) + return self._tokens From 81e5ce30af6f4dbde1e4a426ae0c5eebe2b0ca15 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 13 Aug 2012 19:43:06 -0400 Subject: [PATCH 05/26] Working on the framework for the tokenizer, plus some cleanup, fixes. --- mwparserfromhell/nodes/template.py | 2 +- mwparserfromhell/parser/__init__.py | 8 ++++-- mwparserfromhell/parser/builder.py | 56 ++++++++++++++++++------------------ mwparserfromhell/parser/tokenizer.py | 47 +++++++++++++++++++++++++++--- mwparserfromhell/parser/tokens.py | 44 ++++++++++++++-------------- 5 files changed, 100 insertions(+), 57 deletions(-) diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 0b65aa7..d77388f 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -90,7 +90,7 @@ class Template(Node): before_theories = defaultdict(lambda: 0) after_theories = defaultdict(lambda: 0) for param in self.params: - match = re.search("^(\s*).*?(\s*)$", unicode(param.value), FLAGS) + match = re.search(r"^(\s*).*?(\s*)$", unicode(param.value), FLAGS) before, after = match.group(1), match.group(2) before_theories[before] += 1 after_theories[after] += 1 diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index f70273f..49ea940 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -20,8 +20,12 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from .builder import Builder -from .tokenizer import Tokenizer +try: + from ._builder import CBuilder as Builder + from ._tokenizer import CTokenizer as Tokenizer +except ImportError: + from .builder import Builder + from .tokenizer import Tokenizer __all__ = ["Parser"] diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 3b6a643..80354a9 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -49,17 +49,17 @@ class Builder(object): self._push() while self._tokens: token = self._tokens.pop(0) - if isinstance(token, tokens.TEMPLATE_PARAM_EQUALS): + if isinstance(token, tokens.TemplateParamEquals): key = self._pop() showkey = True self._push() - elif isinstance(token, (tokens.TEMPLATE_PARAM_SEPARATOR, - tokens.TEMPLATE_CLOSE)): + elif isinstance(token, (tokens.TemplateParamSeparator, + tokens.TemplateClose)): self._tokens.insert(0, token) value = self._pop() return Parameter(key, value, showkey) else: - self._stack.write(self._handle_token()) + self._write(self._handle_token()) def _handle_template(self): params = [] @@ -68,7 +68,7 @@ class Builder(object): self._push() while self._tokens: token = self._tokens.pop(0) - if isinstance(token, tokens.TEMPLATE_PARAM_SEPARATOR): + if isinstance(token, tokens.TemplateParamSeparator): if not params: name = self._pop() param = self._handle_parameter(min(int_key_range - int_keys)) @@ -76,18 +76,18 @@ class Builder(object): int_keys.add(int(param.name)) int_key_range.add(len(int_keys) + 1) params.append(param) - elif isinstance(token, tokens.TEMPLATE_CLOSE): + elif isinstance(token, tokens.TemplateClose): if not params: name = self._pop() return Template(name, params) else: - self._stack.write(self._handle_token()) + self._write(self._handle_token()) def _handle_entity(self): token = self._tokens.pop(0) - if isinstance(token, tokens.HTML_ENTITY_NUMERIC): + if isinstance(token, tokens.HTMLEntityNumeric): token = self._tokens.pop(0) - if isinstance(token, tokens.HTML_ENTITY_HEX): + if isinstance(token, tokens.HTMLEntityHex): token = self._tokens.pop(0) return HTMLEntity(token.text, named=False, hexadecimal=True) return HTMLEntity(token.text, named=False, hexadecimal=False) @@ -98,30 +98,30 @@ class Builder(object): self._push() while self._tokens: token = self._tokens.pop(0) - if isinstance(token, tokens.HEADING_BLOCK): + if isinstance(token, tokens.HeadingBlock): title = self._pop() return Heading(title, level) else: - self._stack.write(self._handle_token()) + self._write(self._handle_token()) def _handle_attribute(self): name, quoted = None, False self._push() while self._tokens: token = self._tokens.pop(0) - if isinstance(token, tokens.TAG_ATTR_EQUALS): + if isinstance(token, tokens.TagAttrEquals): name = self._pop() self._push() - elif isinstance(token, tokens.TAG_ATTR_QUOTE): + elif isinstance(token, tokens.TagAttrQuote): quoted = True - elif isinstance(token, (tokens.TAG_ATTR_START, - tokens.TAG_CLOSE_OPEN)): + elif isinstance(token, (tokens.TagAttrStart, + tokens.TagCloseOpen)): self._tokens.insert(0, token) if name is not None: return Attribute(name, self._pop(), quoted) return Attribute(self._pop(), quoted=quoted) else: - self._stack.write(self._handle_token()) + self._write(self._handle_token()) def _handle_tag(self, token): type_, showtag = token.type, token.showtag @@ -129,40 +129,40 @@ class Builder(object): self._push() while self._tokens: token = self._tokens.pop(0) - if isinstance(token, tokens.TAG_ATTR_START): + if isinstance(token, tokens.TagAttrStart): 
attrs.append(self._handle_attribute()) - elif isinstance(token, tokens.TAG_CLOSE_OPEN): + elif isinstance(token, tokens.TagCloseOpen): open_pad = token.padding tag = self._pop() self._push() - elif isinstance(token, tokens.TAG_CLOSE_SELFCLOSE): + elif isinstance(token, tokens.TagCloseSelfclose): tag = self._pop() return Tag(type_, tag, attrs=attrs, showtag=showtag, self_closing=True, open_padding=token.padding) - elif isinstance(token, tokens.TAG_OPEN_CLOSE): + elif isinstance(token, tokens.TagOpenClose): contents = self._pop() - elif isinstance(token, tokens.TAG_CLOSE_CLOSE): + elif isinstance(token, tokens.TagCloseClose): return Tag(type_, tag, contents, attrs, showtag, False, open_pad, token.padding) else: - self._stack.write(self._handle_token()) + self._write(self._handle_token()) def _handle_token(self): token = self._tokens.pop(0) - if isinstance(token, tokens.TEXT): + if isinstance(token, tokens.Text): return Text(token.text) - elif isinstance(token, tokens.TEMPLATE_OPEN): + elif isinstance(token, tokens.TemplateOpen): return self._handle_template() - elif isinstance(token, tokens.HTML_ENTITY_START): + elif isinstance(token, tokens.HTMLEntityStart): return self._handle_entity() - elif isinstance(token, tokens.HEADING_BLOCK): + elif isinstance(token, tokens.HeadingBlock): return self._handle_heading(token) - elif isinstance(token, tokens.TAG_OPEN_OPEN): + elif isinstance(token, tokens.TagOpenOpen): return self._handle_tag(token) def build(self, tokenlist): self._tokens = tokenlist self._push() while self._tokens: - self._stack.write(self._handle_token()) + self._write(self._handle_token()) return self._pop() diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 10b4d8a..36c4517 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -25,12 +25,51 @@ from . 
import tokens __all__ = ["Tokenizer"] class Tokenizer(object): + START = object() + END = object() + def __init__(self): self._text = None self._head = 0 - self._tokens = [] + self._stacks = [] + + self._modifiers = [] + + def _push(self): + self._stacks.append([]) + + def _pop(self): + return self._stacks.pop() + + def _write(self, token, stack=None): + if stack is None: + stack = self._stacks[-1] + if not stack: + stack.append(token) + return + last = stack[-1] + if isinstance(token, tokens.Text) and isinstance(last, tokens.Text): + last.text += token.text + else: + stack.append(token) + + def _read(self, delta=0, wrap=False): + index = self._head + delta + if index < 0 and (not wrap or abs(index) > len(self._text)): + return self.START + if index >= len(self._text): + return self.END + return self._text[index] + + def _parse_until(self, stop): + self._push() + while True: + if self._read() in (stop, self.END): + return self._pop() + else: + self._write(tokens.Text(text=self._read())) + self._head += 1 def tokenize(self, text): - self._text = text - self._tokens.append(tokens.TEXT(text=text)) - return self._tokens + self._text = list(text) + return self._parse_until(stop=self.END) diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 05dbcc9..322b801 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -24,43 +24,43 @@ __all__ = ["Token"] class Token(object): def __init__(self, **kwargs): - self.__kwargs = kwargs + super(Token, self).__setattr__("_kwargs", kwargs) def __getattr__(self, key): - return self.__kwargs[key] + return self._kwargs[key] def __setattr__(self, key, value): - self.__kwargs[key] = value + self._kwargs[key] = value def __delattr__(self, key): - del self.__kwargs[key] + del self._kwargs[key] def make(name): __all__.append(name) return type(name, (Token,), {}) -TEXT = make("TEXT") +Text = make("Text") -TEMPLATE_OPEN = make("TEMPLATE_OPEN") # {{ -TEMPLATE_PARAM_SEPARATOR = make("TEMPLATE_PARAM_SEPARATOR") # | -TEMPLATE_PARAM_EQUALS = make("TEMPLATE_PARAM_EQUALS") # = -TEMPLATE_CLOSE = make("TEMPLATE_CLOSE") # }} +TemplateOpen = make("TemplateOpen") # {{ +TemplateParamSeparator = make("TemplateParamSeparator") # | +TemplateParamEquals = make("TemplateParamEquals") # = +TemplateClose = make("TemplateClose") # }} -HTML_ENTITY_START = make("HTML_ENTITY_START") # & -HTML_ENTITY_NUMERIC = make("HTML_ENTITY_NUMERIC") # # -HTML_ENTITY_HEX = make("HTML_ENTITY_HEX") # x -HTML_ENTITY_END = make("HTML_ENTITY_END") # ; +HTMLEntityStart = make("HTMLEntityStart") # & +HTMLEntityNumeric = make("HTMLEntityNumeric") # # +HTMLEntityHex = make("HTMLEntityHex") # X +HTMLEntityEnd = make("HTMLEntityEnd") # ; -HEADING_BLOCK = make("HEADING_BLOCK") # =... +HeadingBlock = make("HeadingBlock") # =... 
-TAG_OPEN_OPEN = make("TAG_OPEN_OPEN") # < -TAG_ATTR_START = make("TAG_ATTR_START") -TAG_ATTR_EQUALS = make("TAG_ATTR_EQUALS") # = -TAG_ATTR_QUOTE = make("TAG_ATTR_QUOTE") # " -TAG_CLOSE_OPEN = make("TAG_CLOSE_OPEN") # > -TAG_CLOSE_SELFCLOSE = make("TAG_CLOSE_SELFCLOSE") # /> -TAG_OPEN_CLOSE = make("TAG_OPEN_CLOSE") # </ -TAG_CLOSE_CLOSE = make("TAG_CLOSE_CLOSE") # > +TagOpenOpen = make("TagOpenOpen") # < +TagAttrStart = make("TagAttrStart") +TagAttrEquals = make("TagAttrEquals") # = +TagAttrQuote = make("TagAttrQuote") # " +TagCloseOpen = make("TagCloseOpen") # > +TagCloseSelfclose = make("TagCloseSelfclose") # /> +TagOpenClose = make("TagOpenClose") # </ +TagCloseClose = make("TagCloseClose") # > + +del make From 17053e47019979c9ea2d0c2d0aba97d96e15b71a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 13 Aug 2012 19:53:49 -0400 Subject: [PATCH 06/26] Support &#Xhex; in addition to &#xhex;. --- mwparserfromhell/nodes/html_entity.py | 9 +++++++-- mwparserfromhell/parser/builder.py | 5 +++-- mwparserfromhell/parser/tokens.py | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/mwparserfromhell/nodes/html_entity.py b/mwparserfromhell/nodes/html_entity.py index af046ea..8ba2cf6 100644 --- a/mwparserfromhell/nodes/html_entity.py +++ b/mwparserfromhell/nodes/html_entity.py @@ -27,7 +27,7 @@ from . import Node __all__ = ["HTMLEntity"] class HTMLEntity(Node): - def __init__(self, value, named=None, hexadecimal=False): + def __init__(self, value, named=None, hexadecimal=False, hex_char="x"): self._value = value if named is None: # Try to guess whether or not the entity is named try: @@ -45,12 +45,13 @@ class HTMLEntity(Node): else: self._named = named self._hexadecimal = hexadecimal + self._hex_char = hex_char def __unicode__(self): if self.named: return u"&{0};".format(self.value) if self.hexadecimal: - return u"&#x{0};".format(self.value) + return u"&#{0}{1};".format(self.hex_char, self.value) return u"&#{0};".format(self.value) def __strip__(self, normalize, collapse): @@ -93,6 +94,10 @@ class HTMLEntity(Node): def hexadecimal(self): return self._hexadecimal + @property + def hex_char(self): + return self._hex_char + def normalize(self): if self.named: return unichr(htmlentitydefs.name2codepoint[self.value]) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 80354a9..9ac6a70 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -88,8 +88,9 @@ class Builder(object): if isinstance(token, tokens.HTMLEntityNumeric): token = self._tokens.pop(0) if isinstance(token, tokens.HTMLEntityHex): - token = self._tokens.pop(0) - return HTMLEntity(token.text, named=False, hexadecimal=True) + text = self._tokens.pop(0) + return HTMLEntity(text.text, named=False, hexadecimal=True, + hex_char=token.char) return HTMLEntity(token.text, named=False, hexadecimal=False) return HTMLEntity(token.text, named=True, hexadecimal=False) diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 322b801..6c77a5f 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -49,7 +49,7 @@ TemplateClose = make("TemplateClose") # }} HTMLEntityStart = make("HTMLEntityStart") # & HTMLEntityNumeric = make("HTMLEntityNumeric") # # -HTMLEntityHex = make("HTMLEntityHex") # X +HTMLEntityHex = make("HTMLEntityHex") # x HTMLEntityEnd = make("HTMLEntityEnd") # ; HeadingBlock = make("HeadingBlock") # =...
From 32d99c3c75be16f9e190a176486c06dc35131f18 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 14 Aug 2012 03:06:25 -0400 Subject: [PATCH 07/26] Tokenizer now supports a very, very limited template syntax. --- mwparserfromhell/parser/builder.py | 16 ++++++++-------- mwparserfromhell/parser/tokenizer.py | 17 ++++++++++++++++- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 9ac6a70..715aa8e 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -59,7 +59,7 @@ class Builder(object): value = self._pop() return Parameter(key, value, showkey) else: - self._write(self._handle_token()) + self._write(self._handle_token(token)) def _handle_template(self): params = [] @@ -81,7 +81,7 @@ class Builder(object): name = self._pop() return Template(name, params) else: - self._write(self._handle_token()) + self._write(self._handle_token(token)) def _handle_entity(self): token = self._tokens.pop(0) @@ -103,7 +103,7 @@ class Builder(object): title = self._pop() return Heading(title, level) else: - self._write(self._handle_token()) + self._write(self._handle_token(token)) def _handle_attribute(self): name, quoted = None, False @@ -122,7 +122,7 @@ class Builder(object): return Attribute(name, self._pop(), quoted) return Attribute(self._pop(), quoted=quoted) else: - self._write(self._handle_token()) + self._write(self._handle_token(token)) def _handle_tag(self, token): type_, showtag = token.type, token.showtag @@ -146,10 +146,9 @@ class Builder(object): return Tag(type_, tag, contents, attrs, showtag, False, open_pad, token.padding) else: - self._write(self._handle_token()) + self._write(self._handle_token(token)) - def _handle_token(self): - token = self._tokens.pop(0) + def _handle_token(self, token): if isinstance(token, tokens.Text): return Text(token.text) elif isinstance(token, tokens.TemplateOpen): @@ -165,5 +164,6 @@ class Builder(object): self._tokens = tokenlist self._push() while self._tokens: - self._write(self._handle_token()) + node = self._handle_token(self._tokens.pop(0)) + self._write(node) return self._pop() diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 36c4517..d7128a8 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -24,6 +24,9 @@ from . import tokens __all__ = ["Tokenizer"] +class BadRoute(Exception): + pass + class Tokenizer(object): START = object() END = object() @@ -33,7 +36,7 @@ class Tokenizer(object): self._head = 0 self._stacks = [] - self._modifiers = [] + self._context = [] def _push(self): self._stacks.append([]) @@ -66,6 +69,18 @@ class Tokenizer(object): while True: if self._read() in (stop, self.END): return self._pop() + elif self._read(0) == "{" and self._read(1) == "{": + reset = self._head + self._head += 2 + try: + template = self._parse_until("}") + except BadRoute: + self._head = reset + self._write(tokens.Text(text=self._read())) + else: + self._write(tokens.TemplateOpen()) + self._stacks[-1] += template + self._write(tokens.TemplateClose()) else: self._write(tokens.Text(text=self._read())) self._head += 1 From 2209dfc78d93b3563bec67f75c85be7e0eb0c2c6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 14 Aug 2012 03:18:27 -0400 Subject: [PATCH 08/26] Better handling of template ends, refactor. 
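
The stop condition for _parse_until() may now be a sentinel object or
a multi-character string such as "}}", matched character by character.
Illustratively (a hand-run example using the CamelCase token classes
introduced earlier in the series, not a test from this commit):

    Tokenizer().tokenize("{{foo}}")
    # -> [TemplateOpen(), Text(text="foo"), TemplateClose()]
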
--- mwparserfromhell/parser/tokenizer.py | 40 +++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index d7128a8..6318337 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -64,23 +64,35 @@ class Tokenizer(object): return self.END return self._text[index] - def _parse_until(self, stop): + def _parse_template(self): + reset = self._head + self._head += 2 + try: + template = self._parse_until("}}") + except BadRoute: + self._head = reset + self._write(tokens.Text(text=self._read())) + else: + self._write(tokens.TemplateOpen()) + self._stacks[-1] += template + self._write(tokens.TemplateClose()) + + def _parse_until(self, stop=None): self._push() while True: - if self._read() in (stop, self.END): + if self._read() is self.END: return self._pop() - elif self._read(0) == "{" and self._read(1) == "{": - reset = self._head - self._head += 2 - try: - template = self._parse_until("}") - except BadRoute: - self._head = reset - self._write(tokens.Text(text=self._read())) - else: - self._write(tokens.TemplateOpen()) - self._stacks[-1] += template - self._write(tokens.TemplateClose()) + try: + iter(stop) + except TypeError: + if self._read() is stop: + return self._pop() + else: + if all([self._read(i) == stop[i] for i in xrange(len(stop))]): + self._head += len(stop) - 1 + return self._pop() + if self._read(0) == "{" and self._read(1) == "{": + self._parse_template() else: self._write(tokens.Text(text=self._read())) self._head += 1 From d3ea962d271b2079ba840de33253962f8e0c4433 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 14 Aug 2012 18:27:29 -0400 Subject: [PATCH 09/26] Starting work on token contexts. --- mwparserfromhell/parser/contexts.py | 26 ++++++++++++++++++++++ mwparserfromhell/parser/tokenizer.py | 43 +++++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 13 deletions(-) create mode 100644 mwparserfromhell/parser/contexts.py diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py new file mode 100644 index 0000000..f966a1b --- /dev/null +++ b/mwparserfromhell/parser/contexts.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +TEMPLATE = 0b111 +TEMPLATE_NAME = 0b001 +TEMPLATE_PARAM_KEY = 0b010 +TEMPLATE_PARAM_VALUE = 0b100 diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 6318337..260a5b1 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from . import contexts from . import tokens __all__ = ["Tokenizer"] @@ -35,8 +36,7 @@ class Tokenizer(object): self._text = None self._head = 0 self._stacks = [] - - self._context = [] + self._context = 0 def _push(self): self._stacks.append([]) @@ -64,9 +64,29 @@ class Tokenizer(object): return self.END return self._text[index] + def _verify_context(self): + if self._read() is self.END: + if self._context & contexts.INSIDE_TEMPLATE: + raise BadRoute() + + def _catch_stop(self, stop): + if self._read() is self.END: + return True + try: + iter(stop) + except TypeError: + if self._read() is stop: + return True + else: + if all([self._read(i) == stop[i] for i in xrange(len(stop))]): + self._head += len(stop) - 1 + return True + return False + def _parse_template(self): reset = self._head self._head += 2 + self._context |= contexts.TEMPLATE_NAME try: template = self._parse_until("}}") except BadRoute: @@ -77,20 +97,17 @@ class Tokenizer(object): self._stacks[-1] += template self._write(tokens.TemplateClose()) - def _parse_until(self, stop=None): + ending = (contexts.TEMPLATE_NAME, contexts.TEMPLATE_PARAM_KEY, + contexts.TEMPLATE_PARAM_VALUE) + for context in ending: + self._context ^= context if self._context & context else 0 + + def _parse_until(self, stop): self._push() while True: - if self._read() is self.END: + self._verify_context() + if self._catch_stop(stop): return self._pop() - try: - iter(stop) - except TypeError: - if self._read() is stop: - return self._pop() - else: - if all([self._read(i) == stop[i] for i in xrange(len(stop))]): - self._head += len(stop) - 1 - return self._pop() if self._read(0) == "{" and self._read(1) == "{": self._parse_template() else: From 934b1ef016e3bc062b4956184cf48026f4127a6a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 14 Aug 2012 20:32:02 -0400 Subject: [PATCH 10/26] Fixes and improvements. --- mwparserfromhell/nodes/template.py | 14 +++++++------- mwparserfromhell/parser/builder.py | 14 ++++++++++---- mwparserfromhell/parser/tokenizer.py | 9 ++++++++- mwparserfromhell/smart_list.py | 2 ++ mwparserfromhell/utils.py | 9 +++++---- mwparserfromhell/wikicode.py | 4 ++++ 6 files changed, 36 insertions(+), 16 deletions(-) diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index d77388f..772a89a 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -130,10 +130,10 @@ class Template(Node): if self.has_param(name): self.remove(name, keep_field=True) existing = self.get(name) - if showkey is None: # Infer showkey from current value - showkey = existing.showkey - if not showkey: - self._surface_escape(value, "=") + if showkey is not None: + if not showkey: + self._surface_escape(value, "=") + existing.showkey = showkey nodes = existing.value.nodes if force_nonconformity: existing.value = value @@ -143,10 +143,10 @@ class Template(Node): if showkey is None: try: - int(name) - showkey = True + int(unicode(name)) + showkey = False # DEPENDENTS? 
except ValueError: - showkey = False + showkey = True if not showkey: self._surface_escape(value, "=") if not force_nonconformity: diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 715aa8e..ef81083 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -35,11 +35,16 @@ class Builder(object): self._tokens = [] self._stacks = [] + def _wrap(self, nodes): + return Wikicode(SmartList(nodes)) + def _push(self): self._stacks.append([]) - def _pop(self): - return Wikicode(SmartList(self._stacks.pop())) + def _pop(self, wrap=True): + if wrap: + return self._wrap(self._stacks.pop()) + return self._stacks.pop() def _write(self, item): self._stacks[-1].append(item) @@ -71,9 +76,10 @@ class Builder(object): if isinstance(token, tokens.TemplateParamSeparator): if not params: name = self._pop() - param = self._handle_parameter(min(int_key_range - int_keys)) + default = self._wrap(unicode(min(int_key_range - int_keys))) + param = self._handle_parameter(default) if re.match(r"[1-9][0-9]*$", param.name.strip()): - int_keys.add(int(param.name)) + int_keys.add(int(unicode(param.name))) int_key_range.add(len(int_keys) + 1) params.append(param) elif isinstance(token, tokens.TemplateClose): diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 260a5b1..78b6f4d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -66,7 +66,7 @@ class Tokenizer(object): def _verify_context(self): if self._read() is self.END: - if self._context & contexts.INSIDE_TEMPLATE: + if self._context & contexts.TEMPLATE: raise BadRoute() def _catch_stop(self, stop): @@ -110,6 +110,13 @@ class Tokenizer(object): return self._pop() if self._read(0) == "{" and self._read(1) == "{": self._parse_template() + elif self._read(0) == "|" and self._context & contexts.TEMPLATE: + if self._context & contexts.TEMPLATE_NAME: + self._context ^= contexts.TEMPLATE_NAME + if self._context & contexts.TEMPLATE_PARAM_VALUE: + self._context ^= contexts.TEMPLATE_PARAM_VALUE + self._context |= contexts.TEMPLATE_PARAM_KEY + self._write(tokens.TemplateParamSeparator()) else: self._write(tokens.Text(text=self._read())) self._head += 1 diff --git a/mwparserfromhell/smart_list.py b/mwparserfromhell/smart_list.py index 855aaa2..1d64bce 100644 --- a/mwparserfromhell/smart_list.py +++ b/mwparserfromhell/smart_list.py @@ -81,6 +81,7 @@ class SmartList(list): def __iadd__(self, other): self.extend(other) + return self def append(self, item): head = len(self) @@ -221,6 +222,7 @@ class _ListProxy(list): def __iadd__(self, other): self.extend(other) + return self @property def _start(self): diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index 33084b5..9c32c10 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -22,24 +22,25 @@ import mwparserfromhell from .nodes import Node +from .smart_list import SmartList def parse_anything(value): wikicode = mwparserfromhell.wikicode.Wikicode if isinstance(value, wikicode): return value if isinstance(value, Node): - return wikicode([value]) + return wikicode(SmartList([value])) if isinstance(value, basestring): return mwparserfromhell.parse(value) if isinstance(value, int): return mwparserfromhell.parse(unicode(value)) if value is None: - return wikicode([]) + return wikicode(SmartList()) try: - nodelist = [] + nodelist = SmartList() for item in value: nodelist += parse_anything(item).nodes except TypeError: error = "Needs string, Node, 
Wikicode, int, None, or iterable of these, but got {0}: {1}" - raise ValueError(error.format(type(value), value)) + raise ValueError(error.format(type(value).__name__, value)) return wikicode(nodelist) diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index 7680e20..af22c24 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -104,6 +104,10 @@ class Wikicode(StringMixIn): def nodes(self): return self._nodes + @nodes.setter + def nodes(self, value): + self._nodes = value + def get(self, index): return self.nodes[index] From 69b236ee8d2c8c5af32664ea636f5f6f46bb29ff Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 14 Aug 2012 21:05:48 -0400 Subject: [PATCH 11/26] Handle template parameter key dependents more intelligently. --- mwparserfromhell/nodes/template.py | 15 +++++++++++++-- mwparserfromhell/parser/builder.py | 6 +++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 772a89a..29a2a16 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -143,10 +143,21 @@ class Template(Node): if showkey is None: try: - int(unicode(name)) - showkey = False # DEPENDENTS? + int_name = int(unicode(name)) except ValueError: showkey = True + else: + int_keys = set() + for param in self.params: + try: + int_keys.add(int(unicode(param.name))) + except ValueError: + pass + expected = min(set(range(1, len(int_keys) + 2)) - int_keys) + if expected == int_name: + showkey = False + else: + showkey = True if not showkey: self._surface_escape(value, "=") if not force_nonconformity: diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index ef81083..c4a0055 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -20,8 +20,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import re - from . import tokens from ..nodes import Heading, HTMLEntity, Tag, Template, Text from ..nodes.extras import Attribute, Parameter @@ -78,9 +76,11 @@ class Builder(object): name = self._pop() default = self._wrap(unicode(min(int_key_range - int_keys))) param = self._handle_parameter(default) - if re.match(r"[1-9][0-9]*$", param.name.strip()): + try: int_keys.add(int(unicode(param.name))) int_key_range.add(len(int_keys) + 1) + except ValueError: + pass params.append(param) elif isinstance(token, tokens.TemplateClose): if not params: From 6fefc28cbaa452f4bbacccedfd9f398873eab49b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 14 Aug 2012 21:09:21 -0400 Subject: [PATCH 12/26] Handle integers as MediaWiki does. 
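
The restrictive regex matters because int() alone is too permissive;
an illustrative comparison:

    int("01")                        # == 1, yet {{foo|01=bar}} does not
                                     # fill implicit slot 1 in MediaWiki
    re.match(r"[1-9][0-9]*$", "01")  # None  -> stays an explicit key
    re.match(r"[1-9][0-9]*$", "10")  # match -> a valid integer key
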
--- mwparserfromhell/nodes/template.py | 6 ++---- mwparserfromhell/parser/builder.py | 9 ++++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 29a2a16..7240051 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -76,7 +76,7 @@ class Template(Node): code.replace(node, node.replace(char, replacement)) def _blank_param_value(self, value): - match = re.search("^(\s*).*?(\s*)$", unicode(value), FLAGS) + match = re.search(r"^(\s*).*?(\s*)$", unicode(value), FLAGS) value.nodes = [Text(match.group(1)), Text(match.group(2))] def _select_theory(self, theories): @@ -149,10 +149,8 @@ class Template(Node): else: int_keys = set() for param in self.params: - try: + if re.match(r"[1-9][0-9]*$", param.name.strip()): int_keys.add(int(unicode(param.name))) - except ValueError: - pass expected = min(set(range(1, len(int_keys) + 2)) - int_keys) if expected == int_name: showkey = False diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index c4a0055..9d046a6 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -20,6 +20,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import re + from . import tokens from ..nodes import Heading, HTMLEntity, Tag, Template, Text from ..nodes.extras import Attribute, Parameter @@ -76,11 +78,12 @@ class Builder(object): name = self._pop() default = self._wrap(unicode(min(int_key_range - int_keys))) param = self._handle_parameter(default) - try: + if re.match(r"[1-9][0-9]*$", param.name.strip()): + # We try a more restrictive test for integers than + # try: int(), because "01" as a key will pass through int() + # correctly but is not a valid integer key in wikicode: int_keys.add(int(unicode(param.name))) int_key_range.add(len(int_keys) + 1) - except ValueError: - pass params.append(param) elif isinstance(token, tokens.TemplateClose): if not params: From 8dd7b6aada3a1b3887102a6a530e415b24c66b83 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 16 Aug 2012 00:44:14 -0400 Subject: [PATCH 13/26] Handle nested contexts correctly; parameter values; bad parse routes. 
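
The context now lives on the stack frame instead of in one
tokenizer-wide attribute, so every nested template tracks its own
state. A simplified sketch of the layout (illustrative only):

    class Frames(object):
        def __init__(self):
            self._stacks = []          # one [tokens, context] per level

        def push(self, context=0):
            self._stacks.append([[], context])

        def pop(self):
            return self._stacks.pop()[0]

        @property
        def context(self):             # flags of the innermost frame
            return self._stacks[-1][1]

        @context.setter
        def context(self, value):
            self._stacks[-1][1] = value

With a single shared attribute, parsing "{{a|{{b}}|c}}" would let the
inner template clobber the outer template's parameter flags.
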
--- mwparserfromhell/parser/tokenizer.py | 46 +++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 78b6f4d..36086d4 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -36,17 +36,24 @@ class Tokenizer(object): self._text = None self._head = 0 self._stacks = [] - self._context = 0 + + @property + def _context(self): + return self._stacks[-1][1] + + @_context.setter + def _context(self, value): + self._stacks[-1][1] = value def _push(self): - self._stacks.append([]) + self._stacks.append([[], 0]) def _pop(self): - return self._stacks.pop() + return self._stacks.pop()[0] def _write(self, token, stack=None): if stack is None: - stack = self._stacks[-1] + stack = self._stacks[-1][0] if not stack: stack.append(token) return @@ -56,6 +63,11 @@ class Tokenizer(object): else: stack.append(token) + def _write_all(self, tokenlist, stack=None): + if stack is None: + stack = self._stacks[-1][0] + stack.extend(tokenlist) + def _read(self, delta=0, wrap=False): index = self._head + delta if index < 0 and (not wrap or abs(index) > len(self._text)): @@ -64,10 +76,13 @@ class Tokenizer(object): return self.END return self._text[index] + def _at_head(self, chars): + return all([self._read(i) == chars[i] for i in xrange(len(chars))]) + def _verify_context(self): if self._read() is self.END: if self._context & contexts.TEMPLATE: - raise BadRoute() + raise BadRoute(self._pop()) def _catch_stop(self, stop): if self._read() is self.END: @@ -86,37 +101,36 @@ class Tokenizer(object): def _parse_template(self): reset = self._head self._head += 2 - self._context |= contexts.TEMPLATE_NAME try: - template = self._parse_until("}}") + template = self._parse_until("}}", contexts.TEMPLATE_NAME) except BadRoute: self._head = reset self._write(tokens.Text(text=self._read())) else: self._write(tokens.TemplateOpen()) - self._stacks[-1] += template + self._write_all(template) self._write(tokens.TemplateClose()) - ending = (contexts.TEMPLATE_NAME, contexts.TEMPLATE_PARAM_KEY, - contexts.TEMPLATE_PARAM_VALUE) - for context in ending: - self._context ^= context if self._context & context else 0 - - def _parse_until(self, stop): + def _parse_until(self, stop, context=0): self._push() + self._context = context while True: self._verify_context() if self._catch_stop(stop): return self._pop() - if self._read(0) == "{" and self._read(1) == "{": + if self._at_head("{{"): self._parse_template() - elif self._read(0) == "|" and self._context & contexts.TEMPLATE: + elif self._at_head("|") and self._context & contexts.TEMPLATE: if self._context & contexts.TEMPLATE_NAME: self._context ^= contexts.TEMPLATE_NAME if self._context & contexts.TEMPLATE_PARAM_VALUE: self._context ^= contexts.TEMPLATE_PARAM_VALUE self._context |= contexts.TEMPLATE_PARAM_KEY self._write(tokens.TemplateParamSeparator()) + elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY: + self._context ^= contexts.TEMPLATE_PARAM_KEY + self._context |= contexts.TEMPLATE_PARAM_VALUE + self._write(tokens.TemplateParamEquals()) else: self._write(tokens.Text(text=self._read())) self._head += 1 From e79c79762f3102f998c9dcf487d0a65ef84baaeb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 16 Aug 2012 18:56:59 -0400 Subject: [PATCH 14/26] Pre- and post-stop context verification, plus refactor parse_until(). 
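
The "|" and "=" handlers are simple transitions on the context
bitfield. Restated as pure functions over the flags from contexts.py
(a sketch, not code from this patch):

    TEMPLATE_NAME        = 0b001
    TEMPLATE_PARAM_KEY   = 0b010
    TEMPLATE_PARAM_VALUE = 0b100

    def on_pipe(context):
        # "|" closes the name or the current value and opens a new key
        context &= ~(TEMPLATE_NAME | TEMPLATE_PARAM_VALUE)
        return context | TEMPLATE_PARAM_KEY

    def on_equals(context):
        # "=" turns the pending key into a key/value pair
        context &= ~TEMPLATE_PARAM_KEY
        return context | TEMPLATE_PARAM_VALUE

The pre-stop check fails the route on unexpected end-of-input; the
post-stop check rejects a template name that continues onto a new line.
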
--- mwparserfromhell/parser/tokenizer.py | 48 +++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 36086d4..2f16d15 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -38,6 +38,10 @@ class Tokenizer(object): self._stacks = [] @property + def _stack(self): + return self._stacks[-1][0] + + @property def _context(self): return self._stacks[-1][1] @@ -46,14 +50,15 @@ class Tokenizer(object): self._stacks[-1][1] = value def _push(self): - self._stacks.append([[], 0]) + stack, context = [], 0 + self._stacks.append([stack, context]) def _pop(self): return self._stacks.pop()[0] def _write(self, token, stack=None): if stack is None: - stack = self._stacks[-1][0] + stack = self._stack if not stack: stack.append(token) return @@ -65,7 +70,7 @@ class Tokenizer(object): def _write_all(self, tokenlist, stack=None): if stack is None: - stack = self._stacks[-1][0] + stack = self._stack stack.extend(tokenlist) def _read(self, delta=0, wrap=False): @@ -79,7 +84,7 @@ class Tokenizer(object): def _at_head(self, chars): return all([self._read(i) == chars[i] for i in xrange(len(chars))]) - def _verify_context(self): + def _verify_context_pre_stop(self): if self._read() is self.END: if self._context & contexts.TEMPLATE: raise BadRoute(self._pop()) @@ -98,6 +103,14 @@ class Tokenizer(object): return True return False + def _verify_context_post_stop(self): + if self._context & contexts.TEMPLATE_NAME and self._stack: + head = self._stack[-1] + if isinstance(head, tokens.Text): + text, this = head.text, self._read() + if text.strip() and text.endswith("\n") and this != "\n": + raise BadRoute(self._pop()) + def _parse_template(self): reset = self._head self._head += 2 @@ -111,26 +124,33 @@ class Tokenizer(object): self._write_all(template) self._write(tokens.TemplateClose()) + def _handle_template_param(self): + if self._context & contexts.TEMPLATE_NAME: + self._context ^= contexts.TEMPLATE_NAME + if self._context & contexts.TEMPLATE_PARAM_VALUE: + self._context ^= contexts.TEMPLATE_PARAM_VALUE + self._context |= contexts.TEMPLATE_PARAM_KEY + self._write(tokens.TemplateParamSeparator()) + + def _handle_template_param_value(self): + self._context ^= contexts.TEMPLATE_PARAM_KEY + self._context |= contexts.TEMPLATE_PARAM_VALUE + self._write(tokens.TemplateParamEquals()) + def _parse_until(self, stop, context=0): self._push() self._context = context while True: - self._verify_context() + self._verify_context_pre_stop() if self._catch_stop(stop): return self._pop() + self._verify_context_post_stop() if self._at_head("{{"): self._parse_template() elif self._at_head("|") and self._context & contexts.TEMPLATE: - if self._context & contexts.TEMPLATE_NAME: - self._context ^= contexts.TEMPLATE_NAME - if self._context & contexts.TEMPLATE_PARAM_VALUE: - self._context ^= contexts.TEMPLATE_PARAM_VALUE - self._context |= contexts.TEMPLATE_PARAM_KEY - self._write(tokens.TemplateParamSeparator()) + self._handle_template_param() elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY: - self._context ^= contexts.TEMPLATE_PARAM_KEY - self._context |= contexts.TEMPLATE_PARAM_VALUE - self._write(tokens.TemplateParamEquals()) + self._handle_template_param_value() else: self._write(tokens.Text(text=self._read())) self._head += 1 From 02fe3b0833642ac7f463a6dad9786987dbe5c228 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 16 Aug 2012 19:43:40 -0400 
Subject: [PATCH 15/26] Improve handling of templates in strange cases. --- mwparserfromhell/parser/builder.py | 4 ++-- mwparserfromhell/parser/tokenizer.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 9d046a6..929c269 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -76,8 +76,8 @@ class Builder(object): if isinstance(token, tokens.TemplateParamSeparator): if not params: name = self._pop() - default = self._wrap(unicode(min(int_key_range - int_keys))) - param = self._handle_parameter(default) + default = unicode(min(int_key_range - int_keys)) + param = self._handle_parameter(self._wrap([Text(default)])) if re.match(r"[1-9][0-9]*$", param.name.strip()): # We try a more restrictive test for integers than # try: int(), because "01" as a key will pass through int() diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2f16d15..5befcf0 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -107,9 +107,9 @@ class Tokenizer(object): if self._context & contexts.TEMPLATE_NAME and self._stack: head = self._stack[-1] if isinstance(head, tokens.Text): - text, this = head.text, self._read() - if text.strip() and text.endswith("\n") and this != "\n": - raise BadRoute(self._pop()) + if head.text.strip() and head.text.endswith("\n"): + if self._read() not in ["|", "=", "\n"]: + raise BadRoute(self._pop()) def _parse_template(self): reset = self._head From 010bd346530759ccf56cd1137d2a140b78d9dd37 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 17 Aug 2012 00:56:41 -0400 Subject: [PATCH 16/26] Support HTMLEntities. --- mwparserfromhell/parser/builder.py | 3 ++ mwparserfromhell/parser/tokenizer.py | 56 ++++++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 929c269..9edc987 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -98,9 +98,12 @@ class Builder(object): token = self._tokens.pop(0) if isinstance(token, tokens.HTMLEntityHex): text = self._tokens.pop(0) + self._tokens.pop(0) # Remove HTMLEntityEnd return HTMLEntity(text.text, named=False, hexadecimal=True, hex_char=token.char) + self._tokens.pop(0) # Remove HTMLEntityEnd return HTMLEntity(token.text, named=False, hexadecimal=False) + self._tokens.pop(0) # Remove HTMLEntityEnd return HTMLEntity(token.text, named=True, hexadecimal=False) def _handle_heading(self, token): diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 5befcf0..fc09462 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -20,6 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import htmlentitydefs +import string + from . import contexts from . 
import tokens @@ -49,9 +52,8 @@ class Tokenizer(object): def _context(self, value): self._stacks[-1][1] = value - def _push(self): - stack, context = [], 0 - self._stacks.append([stack, context]) + def _push(self, context=0): + self._stacks.append([[], context]) def _pop(self): return self._stacks.pop()[0] @@ -137,9 +139,51 @@ class Tokenizer(object): self._context |= contexts.TEMPLATE_PARAM_VALUE self._write(tokens.TemplateParamEquals()) + def _parse_entity(self): + reset = self._head + self._head += 1 + try: + self._push() + self._write(tokens.HTMLEntityStart()) + numeric = hexadecimal = False + if self._at_head("#"): + numeric = True + self._write(tokens.HTMLEntityNumeric()) + if self._read(1).lower() == "x": + hexadecimal = True + self._write(tokens.HTMLEntityHex(char=self._read(1))) + self._head += 2 + else: + self._head += 1 + text = [] + valid = string.hexdigits if hexadecimal else string.digits + if not numeric and not hexadecimal: + valid += string.ascii_letters + while True: + if self._at_head(";"): + text = "".join(text) + if numeric: + test = int(text, 16) if hexadecimal else int(text) + if test < 1 or test > 0x10FFFF: + raise BadRoute(self._pop()) + else: + if text not in htmlentitydefs.entitydefs: + raise BadRoute(self._pop()) + self._write(tokens.Text(text=text)) + self._write(tokens.HTMLEntityEnd()) + break + if self._read() is self.END or self._read() not in valid: + raise BadRoute(self._pop()) + text.append(self._read()) + self._head += 1 + except BadRoute: + self._head = reset + self._write(tokens.Text(text=self._read())) + else: + self._write_all(self._pop()) + def _parse_until(self, stop, context=0): - self._push() - self._context = context + self._push(context) while True: self._verify_context_pre_stop() if self._catch_stop(stop): @@ -151,6 +195,8 @@ class Tokenizer(object): self._handle_template_param() elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY: self._handle_template_param_value() + elif self._at_head("&"): + self._parse_entity() else: self._write(tokens.Text(text=self._read())) self._head += 1 From 2f4ce003b4d88abcf25ce6a407f7849852bad831 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 17 Aug 2012 03:58:07 -0400 Subject: [PATCH 17/26] Massive speedup and refactoring. 
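
Most of the win comes from buffering plain text: rather than building a
tokens.Text object per character and merging it into the previous one,
characters accumulate in a plain list and are joined into a single Text
token only when a real token (or a pop) forces a flush. In outline
(with "".join standing in for constructing tokens.Text):

    textbuffer = []
    stack = []

    def write_text(char):
        textbuffer.append(char)        # O(1), no token churn

    def write(token):
        if textbuffer:                 # flush once per run of text
            stack.append("".join(textbuffer))
            del textbuffer[:]
        stack.append(token)
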
--- mwparserfromhell/parser/tokenizer.py | 113 +++++++++++++++++++---------------- mwparserfromhell/parser/tokens.py | 3 + 2 files changed, 63 insertions(+), 53 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index fc09462..df0cf12 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -34,6 +34,7 @@ class BadRoute(Exception): class Tokenizer(object): START = object() END = object() + SENTINELS = ["{", "}", "[", "]", "|", "=", "&", END] def __init__(self): self._text = None @@ -52,28 +53,38 @@ class Tokenizer(object): def _context(self, value): self._stacks[-1][1] = value + @property + def _textbuffer(self): + return self._stacks[-1][2] + + @_textbuffer.setter + def _textbuffer(self, value): + self._stacks[-1][2] = value + def _push(self, context=0): - self._stacks.append([[], context]) + self._stacks.append([[], context, []]) def _pop(self): - return self._stacks.pop()[0] - - def _write(self, token, stack=None): - if stack is None: - stack = self._stack - if not stack: - stack.append(token) + top = self._stacks.pop() + stack, text = top[0], top[2] + if text: + stack.append(tokens.Text(text="".join(text))) + return stack + + def _write(self, data, text=False): + if text: + self._textbuffer.append(data) return - last = stack[-1] - if isinstance(token, tokens.Text) and isinstance(last, tokens.Text): - last.text += token.text - else: - stack.append(token) + if self._textbuffer: + self._stack.append(tokens.Text(text="".join(self._textbuffer))) + self._textbuffer = [] + self._stack.append(data) - def _write_all(self, tokenlist, stack=None): - if stack is None: - stack = self._stack - stack.extend(tokenlist) + def _write_all(self, tokenlist): + if self._textbuffer: + self._stack.append(tokens.Text(text="".join(self._textbuffer))) + self._textbuffer = [] + self._stack.extend(tokenlist) def _read(self, delta=0, wrap=False): index = self._head + delta @@ -84,50 +95,34 @@ class Tokenizer(object): return self._text[index] def _at_head(self, chars): + length = len(chars) + if length == 1: + return self._read() == chars return all([self._read(i) == chars[i] for i in xrange(len(chars))]) - def _verify_context_pre_stop(self): - if self._read() is self.END: - if self._context & contexts.TEMPLATE: - raise BadRoute(self._pop()) - - def _catch_stop(self, stop): - if self._read() is self.END: - return True - try: - iter(stop) - except TypeError: - if self._read() is stop: - return True - else: - if all([self._read(i) == stop[i] for i in xrange(len(stop))]): - self._head += len(stop) - 1 - return True - return False - - def _verify_context_post_stop(self): - if self._context & contexts.TEMPLATE_NAME and self._stack: - head = self._stack[-1] - if isinstance(head, tokens.Text): - if head.text.strip() and head.text.endswith("\n"): - if self._read() not in ["|", "=", "\n"]: - raise BadRoute(self._pop()) - def _parse_template(self): reset = self._head self._head += 2 try: - template = self._parse_until("}}", contexts.TEMPLATE_NAME) + template = self._parse(contexts.TEMPLATE_NAME) except BadRoute: self._head = reset - self._write(tokens.Text(text=self._read())) + self._write(self._read(), text=True) else: self._write(tokens.TemplateOpen()) self._write_all(template) self._write(tokens.TemplateClose()) + def _verify_template_name(self): + if self._stack: + text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] + text = "".join([token.text for token in text]) + if text.strip() and "\n" in text: + raise 
BadRoute(self._pop()) + def _handle_template_param(self): if self._context & contexts.TEMPLATE_NAME: + self._verify_template_name() self._context ^= contexts.TEMPLATE_NAME if self._context & contexts.TEMPLATE_PARAM_VALUE: self._context ^= contexts.TEMPLATE_PARAM_VALUE @@ -139,6 +134,12 @@ class Tokenizer(object): self._context |= contexts.TEMPLATE_PARAM_VALUE self._write(tokens.TemplateParamEquals()) + def _handle_template_end(self): + if self._context & contexts.TEMPLATE_NAME: + self._verify_template_name() + self._head += 1 + return self._pop() + def _parse_entity(self): reset = self._head self._head += 1 @@ -178,29 +179,35 @@ class Tokenizer(object): self._head += 1 except BadRoute: self._head = reset - self._write(tokens.Text(text=self._read())) + self._write(self._read(), text=True) else: self._write_all(self._pop()) - def _parse_until(self, stop, context=0): + def _parse(self, context=0): self._push(context) while True: - self._verify_context_pre_stop() - if self._catch_stop(stop): + if self._read() not in self.SENTINELS: + self._write(self._read(), text=True) + self._head += 1 + continue + if self._read() is self.END: + if self._context & contexts.TEMPLATE: + raise BadRoute(self._pop()) return self._pop() - self._verify_context_post_stop() if self._at_head("{{"): self._parse_template() elif self._at_head("|") and self._context & contexts.TEMPLATE: self._handle_template_param() elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY: self._handle_template_param_value() + elif self._at_head("}}") and self._context & contexts.TEMPLATE: + return self._handle_template_end() elif self._at_head("&"): self._parse_entity() else: - self._write(tokens.Text(text=self._read())) + self._write(self._read(), text=True) self._head += 1 def tokenize(self, text): self._text = list(text) - return self._parse_until(stop=self.END) + return self._parse() diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 6c77a5f..a465227 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -26,6 +26,9 @@ class Token(object): def __init__(self, **kwargs): super(Token, self).__setattr__("_kwargs", kwargs) + def __repr__(self): + return type(self).__name__ + def __getattr__(self, key): return self._kwargs[key] From 31103b5891eaf834924ce2be6c3e10f9f77181fe Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 17 Aug 2012 04:07:24 -0400 Subject: [PATCH 18/26] _push_textbuffer() in _verify_template_name() --- mwparserfromhell/parser/tokenizer.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index df0cf12..ecb90e8 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -64,26 +64,24 @@ class Tokenizer(object): def _push(self, context=0): self._stacks.append([[], context, []]) + def _push_textbuffer(self): + if self._textbuffer: + self._stack.append(tokens.Text(text="".join(self._textbuffer))) + self._textbuffer = [] + def _pop(self): - top = self._stacks.pop() - stack, text = top[0], top[2] - if text: - stack.append(tokens.Text(text="".join(text))) - return stack + self._push_textbuffer() + return self._stacks.pop()[0] def _write(self, data, text=False): if text: self._textbuffer.append(data) return - if self._textbuffer: - self._stack.append(tokens.Text(text="".join(self._textbuffer))) - self._textbuffer = [] + self._push_textbuffer() self._stack.append(data) def 
_write_all(self, tokenlist): - if self._textbuffer: - self._stack.append(tokens.Text(text="".join(self._textbuffer))) - self._textbuffer = [] + self._push_textbuffer() self._stack.extend(tokenlist) def _read(self, delta=0, wrap=False): @@ -114,10 +112,12 @@ class Tokenizer(object): self._write(tokens.TemplateClose()) def _verify_template_name(self): + self._push_textbuffer() if self._stack: text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] + print text text = "".join([token.text for token in text]) - if text.strip() and "\n" in text: + if text.strip() and "\n" in text.strip(): raise BadRoute(self._pop()) def _handle_template_param(self): From 889fd316d3b5450a3ab64015fe86ea827ddfbaf7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 17 Aug 2012 04:08:21 -0400 Subject: [PATCH 19/26] A rogue print statement appears! --- mwparserfromhell/parser/tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index ecb90e8..6e6c05b 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -115,7 +115,6 @@ class Tokenizer(object): self._push_textbuffer() if self._stack: text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] - print text text = "".join([token.text for token in text]) if text.strip() and "\n" in text.strip(): raise BadRoute(self._pop()) From 3fd13100da734ccf6c85756aad67b17d014c1c7d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 17 Aug 2012 04:24:52 -0400 Subject: [PATCH 20/26] Another speedup by reducing calls to _read(). --- mwparserfromhell/parser/tokenizer.py | 38 +++++++++++++++++------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 6e6c05b..eb42c28 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -88,15 +88,10 @@ class Tokenizer(object): index = self._head + delta if index < 0 and (not wrap or abs(index) > len(self._text)): return self.START - if index >= len(self._text): + try: + return self._text[index] + except IndexError: return self.END - return self._text[index] - - def _at_head(self, chars): - length = len(chars) - if length == 1: - return self._read() == chars - return all([self._read(i) == chars[i] for i in xrange(len(chars))]) def _parse_template(self): reset = self._head @@ -146,7 +141,7 @@ class Tokenizer(object): self._push() self._write(tokens.HTMLEntityStart()) numeric = hexadecimal = False - if self._at_head("#"): + if self._read() == "#": numeric = True self._write(tokens.HTMLEntityNumeric()) if self._read(1).lower() == "x": @@ -160,7 +155,8 @@ class Tokenizer(object): if not numeric and not hexadecimal: valid += string.ascii_letters while True: - if self._at_head(";"): + this = self._read() + if this == ";": text = "".join(text) if numeric: test = int(text, 16) if hexadecimal else int(text) @@ -172,9 +168,9 @@ class Tokenizer(object): self._write(tokens.Text(text=text)) self._write(tokens.HTMLEntityEnd()) break - if self._read() is self.END or self._read() not in valid: + if this is self.END or this not in valid: raise BadRoute(self._pop()) - text.append(self._read()) + text.append(this) self._head += 1 except BadRoute: self._head = reset @@ -185,26 +181,28 @@ class Tokenizer(object): def _parse(self, context=0): self._push(context) while True: - if self._read() not in self.SENTINELS: + this = self._read() + if this not in self.SENTINELS: 
self._write(self._read(), text=True) self._head += 1 continue - if self._read() is self.END: + if this is self.END: if self._context & contexts.TEMPLATE: raise BadRoute(self._pop()) return self._pop() - if self._at_head("{{"): + next = self._read(1) + if this == next == "{": self._parse_template() - elif self._at_head("|") and self._context & contexts.TEMPLATE: + elif this == "|" and self._context & contexts.TEMPLATE: self._handle_template_param() - elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY: + elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY: self._handle_template_param_value() - elif self._at_head("}}") and self._context & contexts.TEMPLATE: + elif this == next == "}" and self._context & contexts.TEMPLATE: return self._handle_template_end() - elif self._at_head("&"): + elif this == "&": self._parse_entity() else: - self._write(self._read(), text=True) + self._write(this, text=True) self._head += 1 def tokenize(self, text): From aec66b0db717da68035d5e4ba8e800485e68a475 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 17 Aug 2012 04:30:03 -0400 Subject: [PATCH 21/26] Missed another call (2.0 seconds -> 1.5 seconds for 1,000,000 chars). --- mwparserfromhell/parser/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index eb42c28..7485afb 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -183,7 +183,7 @@ class Tokenizer(object): while True: this = self._read() if this not in self.SENTINELS: - self._write(self._read(), text=True) + self._write(this, text=True) self._head += 1 continue if this is self.END: From 278594a8cf418802556a597a8afa80dd217b91b9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 17 Aug 2012 19:42:17 -0400 Subject: [PATCH 22/26] Faster parsing: split the text on sentinels instead of every letter. --- mwparserfromhell/parser/tokenizer.py | 54 ++++++++++++++++++------------------ mwparserfromhell/parser/tokens.py | 8 +++++- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 7485afb..4661157 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -21,6 +21,7 @@ # SOFTWARE. import htmlentitydefs +import re import string from . 
import contexts @@ -35,6 +36,7 @@ class Tokenizer(object): START = object() END = object() SENTINELS = ["{", "}", "[", "]", "|", "=", "&", END] + REGEX = r"([{}\[\]|=&;])" def __init__(self): self._text = None @@ -135,48 +137,45 @@ class Tokenizer(object): return self._pop() def _parse_entity(self): - reset = self._head - self._head += 1 try: self._push() self._write(tokens.HTMLEntityStart()) + this = self._read(1) + if this is self.END: + raise BadRoute(self._pop()) numeric = hexadecimal = False - if self._read() == "#": + skip = 0 + if this.startswith("#"): numeric = True self._write(tokens.HTMLEntityNumeric()) - if self._read(1).lower() == "x": + if this[1:].lower().startswith("x"): hexadecimal = True - self._write(tokens.HTMLEntityHex(char=self._read(1))) - self._head += 2 + self._write(tokens.HTMLEntityHex(char=this[1])) + skip = 2 else: - self._head += 1 - text = [] + skip = 1 + text = this[skip:] valid = string.hexdigits if hexadecimal else string.digits if not numeric and not hexadecimal: valid += string.ascii_letters - while True: - this = self._read() - if this == ";": - text = "".join(text) - if numeric: - test = int(text, 16) if hexadecimal else int(text) - if test < 1 or test > 0x10FFFF: - raise BadRoute(self._pop()) - else: - if text not in htmlentitydefs.entitydefs: - raise BadRoute(self._pop()) - self._write(tokens.Text(text=text)) - self._write(tokens.HTMLEntityEnd()) - break - if this is self.END or this not in valid: + if not text or not all([char in valid for char in text]): + raise BadRoute(self._pop()) + if self._read(2) != ";": + raise BadRoute(self._pop()) + if numeric: + test = int(text, 16) if hexadecimal else int(text) + if test < 1 or test > 0x10FFFF: raise BadRoute(self._pop()) - text.append(this) - self._head += 1 + else: + if text not in htmlentitydefs.entitydefs: + raise BadRoute(self._pop()) + self._write(tokens.Text(text=text)) + self._write(tokens.HTMLEntityEnd()) except BadRoute: - self._head = reset self._write(self._read(), text=True) else: self._write_all(self._pop()) + self._head += 2 def _parse(self, context=0): self._push(context) @@ -206,5 +205,6 @@ class Tokenizer(object): self._head += 1 def tokenize(self, text): - self._text = list(text) + split = re.split(self.REGEX, text, flags=re.I) + self._text = [segment for segment in split if segment] return self._parse() diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index a465227..a5f74fc 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -27,7 +27,13 @@ class Token(object): super(Token, self).__setattr__("_kwargs", kwargs) def __repr__(self): - return type(self).__name__ + args = [] + for key, value in self._kwargs.iteritems(): + if len(value) > 100: + args.append(key + "=" + repr(value[:97] + "...")) + else: + args.append(key + "=" + repr(value)) + return u"{0}({1})".format(type(self).__name__, u", ".join(args)) def __getattr__(self, key): return self._kwargs[key] From 664b6e39ece25e46f1306f8b127d289668f13d5b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 18 Aug 2012 15:39:41 -0400 Subject: [PATCH 23/26] _write(text=True) -> _write_text(); __eq__ for Tokens --- mwparserfromhell/parser/tokenizer.py | 20 ++++++++++---------- mwparserfromhell/parser/tokens.py | 5 +++++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 4661157..fcc75f7 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py 
@@ -75,12 +75,12 @@ class Tokenizer(object): self._push_textbuffer() return self._stacks.pop()[0] - def _write(self, data, text=False): - if text: - self._textbuffer.append(data) - return + def _write(self, token): self._push_textbuffer() - self._stack.append(data) + self._stack.append(token) + + def _write_text(self, text): + self._textbuffer.append(text) def _write_all(self, tokenlist): self._push_textbuffer() @@ -102,7 +102,7 @@ class Tokenizer(object): template = self._parse(contexts.TEMPLATE_NAME) except BadRoute: self._head = reset - self._write(self._read(), text=True) + self._write_text(self._read()) else: self._write(tokens.TemplateOpen()) self._write_all(template) @@ -137,8 +137,8 @@ class Tokenizer(object): return self._pop() def _parse_entity(self): + self._push() try: - self._push() self._write(tokens.HTMLEntityStart()) this = self._read(1) if this is self.END: @@ -172,7 +172,7 @@ class Tokenizer(object): self._write(tokens.Text(text=text)) self._write(tokens.HTMLEntityEnd()) except BadRoute: - self._write(self._read(), text=True) + self._write_text(self._read()) else: self._write_all(self._pop()) self._head += 2 @@ -182,7 +182,7 @@ class Tokenizer(object): while True: this = self._read() if this not in self.SENTINELS: - self._write(this, text=True) + self._write_text(this) self._head += 1 continue if this is self.END: @@ -201,7 +201,7 @@ class Tokenizer(object): elif this == "&": self._parse_entity() else: - self._write(this, text=True) + self._write_text(this) self._head += 1 def tokenize(self, text): diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index a5f74fc..88881c7 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -35,6 +35,11 @@ class Token(object): args.append(key + "=" + repr(value)) return u"{0}({1})".format(type(self).__name__, u", ".join(args)) + def __eq__(self, other): + if isinstance(other, type(self)): + return self._kwargs == other._kwargs + return False + def __getattr__(self, key): return self._kwargs[key] From 40b6480aa7c454ec10abf74a8cda7961a2485958 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 18 Aug 2012 16:53:32 -0400 Subject: [PATCH 24/26] Split on all characters that we use. 
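
With every marker character in the split regex, tokenize() hands
_parse() a list in which each marker is its own element and each run of
ordinary text is a single element, so _read() usually returns whole
words instead of single characters. For example (empty segments
filtered out, as in tokenize()):

    >>> import re
    >>> regex = re.compile(r"([{}\[\]<>|=&#*;:/-])")
    >>> [seg for seg in regex.split("{{foo|bar=&amp;}}") if seg]
    ['{', '{', 'foo', '|', 'bar', '=', '&', 'amp', ';', '}', '}']

The entity parser is reworked accordingly: it advances _head from
segment to segment instead of slicing a single string.
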
--- mwparserfromhell/parser/tokenizer.py | 42 ++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index fcc75f7..a2606e9 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -35,8 +35,9 @@ class BadRoute(Exception): class Tokenizer(object): START = object() END = object() - SENTINELS = ["{", "}", "[", "]", "|", "=", "&", END] - REGEX = r"([{}\[\]|=&;])" + SENTINELS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", + ":", "/", "-", END] + regex = re.compile(r"([{}\[\]<>|=&#*;:/-])", flags=re.IGNORECASE) def __init__(self): self._text = None @@ -137,45 +138,48 @@ class Tokenizer(object): return self._pop() def _parse_entity(self): + reset = self._head self._push() try: self._write(tokens.HTMLEntityStart()) - this = self._read(1) - if this is self.END: + self._head += 1 + this = self._read() + if not this or this is self.END: raise BadRoute(self._pop()) numeric = hexadecimal = False - skip = 0 - if this.startswith("#"): + if this == "#": numeric = True self._write(tokens.HTMLEntityNumeric()) - if this[1:].lower().startswith("x"): + self._head += 1 + this = self._read() + if not this or this is self.END: + raise BadRoute(self._pop()) + if this[0].lower() == "x": hexadecimal = True - self._write(tokens.HTMLEntityHex(char=this[1])) - skip = 2 - else: - skip = 1 - text = this[skip:] + self._write(tokens.HTMLEntityHex(char=this[0])) + this = this[1:] valid = string.hexdigits if hexadecimal else string.digits if not numeric and not hexadecimal: valid += string.ascii_letters - if not text or not all([char in valid for char in text]): + if not all([char in valid for char in this]): raise BadRoute(self._pop()) - if self._read(2) != ";": + self._head += 1 + if self._read() != ";": raise BadRoute(self._pop()) if numeric: - test = int(text, 16) if hexadecimal else int(text) + test = int(this, 16) if hexadecimal else int(this) if test < 1 or test > 0x10FFFF: raise BadRoute(self._pop()) else: - if text not in htmlentitydefs.entitydefs: + if this not in htmlentitydefs.entitydefs: raise BadRoute(self._pop()) - self._write(tokens.Text(text=text)) + self._write(tokens.Text(text=this)) self._write(tokens.HTMLEntityEnd()) except BadRoute: + self._head = reset self._write_text(self._read()) else: self._write_all(self._pop()) - self._head += 2 def _parse(self, context=0): self._push(context) @@ -205,6 +209,6 @@ class Tokenizer(object): self._head += 1 def tokenize(self, text): - split = re.split(self.REGEX, text, flags=re.I) + split = self.regex.split(text) self._text = [segment for segment in split if segment] return self._parse() From e57b6bdd93b1a41ac066dec8407fe649f8d45169 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 Aug 2012 20:00:31 -0400 Subject: [PATCH 25/26] Support Headings in tokenizer; handle tokens backwards in builder. * Some other fixes, additions. 
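
Reversing the token list once lets the builder consume it with
list.pop(), which is O(1) at the end of a list, instead of pop(0),
which shifts every remaining element. Heading levels are packed into
the context as a single HEADING_LEVEL_n bit and recovered with a
logarithm; roughly (value as in contexts.py):

    from math import log

    HEADING_LEVEL_1 = 1 << 3           # 0b000001000

    def heading_level(context):
        # context holds exactly one HEADING_LEVEL_n bit here
        return int(log(context / HEADING_LEVEL_1, 2)) + 1

    assert heading_level(HEADING_LEVEL_1 << 2) == 3   # a level-3 heading
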
--- mwparserfromhell/nodes/template.py | 9 +- mwparserfromhell/parser/builder.py | 54 ++++++------ mwparserfromhell/parser/contexts.py | 23 +++++- mwparserfromhell/parser/tokenizer.py | 155 +++++++++++++++++++++++++---------- mwparserfromhell/parser/tokens.py | 5 +- 5 files changed, 165 insertions(+), 81 deletions(-) diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 7240051..5b17351 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -118,7 +118,7 @@ class Template(Node): def get(self, name): name = name.strip() if isinstance(name, basestring) else unicode(name) - for param in self.params: + for param in reversed(self.params): if param.name.strip() == name: return param raise ValueError(name) @@ -149,8 +149,9 @@ class Template(Node): else: int_keys = set() for param in self.params: - if re.match(r"[1-9][0-9]*$", param.name.strip()): - int_keys.add(int(unicode(param.name))) + if not param.showkey: + if re.match(r"[1-9][0-9]*$", param.name.strip()): + int_keys.add(int(unicode(param.name))) expected = min(set(range(1, len(int_keys) + 2)) - int_keys) if expected == int_name: showkey = False @@ -170,7 +171,7 @@ class Template(Node): self.params.append(param) return param - def remove(self, name, keep_field=False, force_no_field=False): + def remove(self, name, keep_field=False, force_no_field=False): # KEEP FIRST FIELD, REMOVE ALL AFTER name = name.strip() if isinstance(name, basestring) else unicode(name) for i, param in enumerate(self.params): if param.name.strip() == name: diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 9edc987..d352321 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -20,8 +20,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import re - from . 
import tokens from ..nodes import Heading, HTMLEntity, Tag, Template, Text from ..nodes.extras import Attribute, Parameter @@ -49,42 +47,39 @@ class Builder(object): def _write(self, item): self._stacks[-1].append(item) - def _handle_parameter(self, key): + def _handle_parameter(self, default): + key = None showkey = False self._push() while self._tokens: - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.TemplateParamEquals): key = self._pop() showkey = True self._push() elif isinstance(token, (tokens.TemplateParamSeparator, tokens.TemplateClose)): - self._tokens.insert(0, token) + self._tokens.append(token) value = self._pop() + if not key: + key = self._wrap([Text(unicode(default))]) return Parameter(key, value, showkey) else: self._write(self._handle_token(token)) def _handle_template(self): params = [] - int_keys = set() - int_key_range = {1} + default = 1 self._push() while self._tokens: - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.TemplateParamSeparator): if not params: name = self._pop() - default = unicode(min(int_key_range - int_keys)) - param = self._handle_parameter(self._wrap([Text(default)])) - if re.match(r"[1-9][0-9]*$", param.name.strip()): - # We try a more restrictive test for integers than - # try: int(), because "01" as a key will pass through int() - # correctly but is not a valid integer key in wikicode: - int_keys.add(int(unicode(param.name))) - int_key_range.add(len(int_keys) + 1) + param = self._handle_parameter(default) params.append(param) + if not param.showkey: + default += 1 elif isinstance(token, tokens.TemplateClose): if not params: name = self._pop() @@ -93,25 +88,25 @@ class Builder(object): self._write(self._handle_token(token)) def _handle_entity(self): - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.HTMLEntityNumeric): - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.HTMLEntityHex): - text = self._tokens.pop(0) - self._tokens.pop(0) # Remove HTMLEntityEnd + text = self._tokens.pop() + self._tokens.pop() # Remove HTMLEntityEnd return HTMLEntity(text.text, named=False, hexadecimal=True, hex_char=token.char) - self._tokens.pop(0) # Remove HTMLEntityEnd + self._tokens.pop() # Remove HTMLEntityEnd return HTMLEntity(token.text, named=False, hexadecimal=False) - self._tokens.pop(0) # Remove HTMLEntityEnd + self._tokens.pop() # Remove HTMLEntityEnd return HTMLEntity(token.text, named=True, hexadecimal=False) def _handle_heading(self, token): level = token.level self._push() while self._tokens: - token = self._tokens.pop(0) - if isinstance(token, tokens.HeadingBlock): + token = self._tokens.pop() + if isinstance(token, tokens.HeadingEnd): title = self._pop() return Heading(title, level) else: @@ -121,7 +116,7 @@ class Builder(object): name, quoted = None, False self._push() while self._tokens: - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.TagAttrEquals): name = self._pop() self._push() @@ -129,7 +124,7 @@ class Builder(object): quoted = True elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen)): - self._tokens.insert(0, token) + self._tokens.append(token) if name is not None: return Attribute(name, self._pop(), quoted) return Attribute(self._pop(), quoted=quoted) @@ -141,7 +136,7 @@ class Builder(object): attrs = [] self._push() while self._tokens: - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.TagAttrStart): 
attrs.append(self._handle_attribute()) elif isinstance(token, tokens.TagCloseOpen): @@ -167,15 +162,16 @@ class Builder(object): return self._handle_template() elif isinstance(token, tokens.HTMLEntityStart): return self._handle_entity() - elif isinstance(token, tokens.HeadingBlock): + elif isinstance(token, tokens.HeadingStart): return self._handle_heading(token) elif isinstance(token, tokens.TagOpenOpen): return self._handle_tag(token) def build(self, tokenlist): self._tokens = tokenlist + self._tokens.reverse() self._push() while self._tokens: - node = self._handle_token(self._tokens.pop(0)) + node = self._handle_token(self._tokens.pop()) self._write(node) return self._pop() diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index f966a1b..6369ee2 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -20,7 +20,22 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -TEMPLATE = 0b111 -TEMPLATE_NAME = 0b001 -TEMPLATE_PARAM_KEY = 0b010 -TEMPLATE_PARAM_VALUE = 0b100 +# Local (stack-specific) contexts: + +TEMPLATE = 0b000000111 +TEMPLATE_NAME = 0b000000001 +TEMPLATE_PARAM_KEY = 0b000000010 +TEMPLATE_PARAM_VALUE = 0b000000100 + +HEADING = 0b111111000 +HEADING_LEVEL_1 = 0b000001000 +HEADING_LEVEL_2 = 0b000010000 +HEADING_LEVEL_3 = 0b000100000 +HEADING_LEVEL_4 = 0b001000000 +HEADING_LEVEL_5 = 0b010000000 +HEADING_LEVEL_6 = 0b100000000 + + +# Global contexts: + +GL_HEADING = 0b1 diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index a2606e9..159ba67 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -21,6 +21,7 @@ # SOFTWARE. import htmlentitydefs +from math import log import re import string @@ -32,17 +33,19 @@ __all__ = ["Tokenizer"] class BadRoute(Exception): pass + class Tokenizer(object): START = object() END = object() - SENTINELS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", - ":", "/", "-", END] - regex = re.compile(r"([{}\[\]<>|=&#*;:/-])", flags=re.IGNORECASE) + MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", + "/", "-", "\n", END] + regex = re.compile(r"([{}\[\]<>|=&#*;:/\-\n])", flags=re.IGNORECASE) def __init__(self): self._text = None self._head = 0 self._stacks = [] + self._global = 0 @property def _stack(self): @@ -76,6 +79,10 @@ class Tokenizer(object): self._push_textbuffer() return self._stacks.pop()[0] + def _fail_route(self): + self._pop() + raise BadRoute() + def _write(self, token): self._push_textbuffer() self._stack.append(token) @@ -84,16 +91,20 @@ class Tokenizer(object): self._textbuffer.append(text) def _write_all(self, tokenlist): + if tokenlist and isinstance(tokenlist[0], tokens.Text): + self._write_text(tokenlist.pop(0).text) self._push_textbuffer() self._stack.extend(tokenlist) - def _read(self, delta=0, wrap=False): + def _read(self, delta=0, wrap=False, strict=False): index = self._head + delta if index < 0 and (not wrap or abs(index) > len(self._text)): return self.START try: return self._text[index] except IndexError: + if strict: + self._fail_route() return self.END def _parse_template(self): @@ -115,7 +126,7 @@ class Tokenizer(object): text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] text = "".join([token.text for token in text]) if text.strip() and "\n" in text.strip(): - raise BadRoute(self._pop()) + self._fail_route() def _handle_template_param(self): if self._context & 
contexts.TEMPLATE_NAME: @@ -137,44 +148,98 @@ class Tokenizer(object): self._head += 1 return self._pop() - def _parse_entity(self): + def _parse_heading(self): + self._global |= contexts.GL_HEADING reset = self._head - self._push() + self._head += 1 + best = 1 + while self._read() == "=": + best += 1 + self._head += 1 + context = contexts.HEADING_LEVEL_1 << min(best - 1, 5) + try: - self._write(tokens.HTMLEntityStart()) + title, level = self._parse(context) + except BadRoute: + self._head = reset + best - 1 + self._write_text("=" * best) + else: + self._write(tokens.HeadingStart(level=level)) + if level < best: + self._write_text("=" * (best - level)) + self._write_all(title) + self._write(tokens.HeadingEnd()) + finally: + self._global ^= contexts.GL_HEADING + + def _handle_heading_end(self): + reset = self._head + self._head += 1 + best = 1 + while self._read() == "=": + best += 1 self._head += 1 - this = self._read() - if not this or this is self.END: - raise BadRoute(self._pop()) - numeric = hexadecimal = False - if this == "#": - numeric = True - self._write(tokens.HTMLEntityNumeric()) - self._head += 1 - this = self._read() - if not this or this is self.END: - raise BadRoute(self._pop()) - if this[0].lower() == "x": - hexadecimal = True - self._write(tokens.HTMLEntityHex(char=this[0])) - this = this[1:] - valid = string.hexdigits if hexadecimal else string.digits - if not numeric and not hexadecimal: - valid += string.ascii_letters - if not all([char in valid for char in this]): - raise BadRoute(self._pop()) + current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1 + level = min(current, min(best, 6)) + + try: + after, after_level = self._parse(self._context) + except BadRoute: + if level < best: + self._write_text("=" * (best - level)) + self._head = reset + best - 1 + return self._pop(), level + else: + self._write_text("=" * best) + self._write_all(after) + return self._pop(), after_level + + def _really_parse_entity(self): + self._write(tokens.HTMLEntityStart()) + self._head += 1 + + this = self._read(strict=True) + if this == "#": + numeric = True + self._write(tokens.HTMLEntityNumeric()) self._head += 1 - if self._read() != ";": - raise BadRoute(self._pop()) - if numeric: - test = int(this, 16) if hexadecimal else int(this) - if test < 1 or test > 0x10FFFF: - raise BadRoute(self._pop()) + this = self._read(strict=True) + if this[0].lower() == "x": + hexadecimal = True + self._write(tokens.HTMLEntityHex(char=this[0])) + this = this[1:] + if not this: + self._fail_route() else: - if this not in htmlentitydefs.entitydefs: - raise BadRoute(self._pop()) - self._write(tokens.Text(text=this)) - self._write(tokens.HTMLEntityEnd()) + hexadecimal = False + else: + numeric = hexadecimal = False + + valid = string.hexdigits if hexadecimal else string.digits + if not numeric and not hexadecimal: + valid += string.ascii_letters + if not all([char in valid for char in this]): + self._fail_route() + + self._head += 1 + if self._read() != ";": + self._fail_route() + if numeric: + test = int(this, 16) if hexadecimal else int(this) + if test < 1 or test > 0x10FFFF: + self._fail_route() + else: + if this not in htmlentitydefs.entitydefs: + self._fail_route() + + self._write(tokens.Text(text=this)) + self._write(tokens.HTMLEntityEnd()) + + def _parse_entity(self): + reset = self._head + self._push() + try: + self._really_parse_entity() except BadRoute: self._head = reset self._write_text(self._read()) @@ -185,15 +250,15 @@ class Tokenizer(object): self._push(context) while True: this = 
self._read()
-            if this not in self.SENTINELS:
+            if this not in self.MARKERS:
                 self._write_text(this)
                 self._head += 1
                 continue
             if this is self.END:
-                if self._context & contexts.TEMPLATE:
-                    raise BadRoute(self._pop())
+                if self._context & (contexts.TEMPLATE | contexts.HEADING):
+                    self._fail_route()
                 return self._pop()
-            next = self._read(1)
+            prev, next = self._read(-1), self._read(1)
             if this == next == "{":
                 self._parse_template()
             elif this == "|" and self._context & contexts.TEMPLATE:
@@ -202,6 +267,12 @@
                 self._handle_template_param()
             elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
                 self._handle_template_param_value()
             elif this == next == "}" and self._context & contexts.TEMPLATE:
                 return self._handle_template_end()
+            elif (prev == "\n" or prev == self.START) and this == "=" and not self._global & contexts.GL_HEADING:
+                self._parse_heading()
+            elif this == "=" and self._context & contexts.HEADING:
+                return self._handle_heading_end()
+            elif this == "\n" and self._context & contexts.HEADING:
+                self._fail_route()
             elif this == "&":
                 self._parse_entity()
             else:
diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py
index 88881c7..3cb73c9 100644
--- a/mwparserfromhell/parser/tokens.py
+++ b/mwparserfromhell/parser/tokens.py
@@ -29,7 +29,7 @@ class Token(object):
     def __repr__(self):
         args = []
         for key, value in self._kwargs.iteritems():
-            if len(value) > 100:
+            if isinstance(value, basestring) and len(value) > 100:
                 args.append(key + "=" + repr(value[:97] + "..."))
             else:
                 args.append(key + "=" + repr(value))
@@ -66,7 +66,8 @@
 HTMLEntityNumeric = make("HTMLEntityNumeric") # #
 HTMLEntityHex = make("HTMLEntityHex") # x
 HTMLEntityEnd = make("HTMLEntityEnd") # ;
-HeadingBlock = make("HeadingBlock") # =...
+HeadingStart = make("HeadingStart") # =...
+HeadingEnd = make("HeadingEnd") # =...
 TagOpenOpen = make("TagOpenOpen") # <
 TagAttrStart = make("TagAttrStart")
From fc66ce065490d55b9ad54e09d5e1dda168df706c Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Mon, 20 Aug 2012 17:24:16 -0400
Subject: [PATCH 26/26] Update remove() for when a template has the same key multiple times.
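
Several parameters can answer to the same key: in
{{foo|bar|baz|2=quux}} both "baz" and "2=quux" claim key "2". remove()
now walks every match instead of stopping at the first, and decides per
occurrence whether the parameter may vanish outright or must be
blanked. The rule for positional parameters, restated (a sketch, not
code from this patch):

    def removable_entirely(params, i, force_no_field):
        # a positional parameter may only vanish outright if no later
        # parameter is also positional; removing it would otherwise
        # shift every following implicit key down by one
        param = params[i]
        if not param.showkey and not force_no_field:
            if any(not after.showkey for after in params[i + 1:]):
                return False           # blank the value instead
        return True
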
--- mwparserfromhell/nodes/template.py | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 5b17351..a1cbd61 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -99,6 +99,21 @@ class Template(Node): after = self._select_theory(after_theories) return before, after + def _remove_with_field(self, param, i, name): + if param.showkey: + following = self.params[i+1:] + better_matches = [after.name.strip() == name and not after.showkey for after in following] + if any(better_matches): + return False + return True + + def _remove_without_field(self, param, i, force_no_field): + if not param.showkey and not force_no_field: + dependents = [not after.showkey for after in self.params[i+1:]] + if any(dependents): + return False + return True + @property def name(self): return self._name @@ -171,14 +186,23 @@ class Template(Node): self.params.append(param) return param - def remove(self, name, keep_field=False, force_no_field=False): # KEEP FIRST FIELD, REMOVE ALL AFTER + def remove(self, name, keep_field=False, force_no_field=False): name = name.strip() if isinstance(name, basestring) else unicode(name) + removed = False for i, param in enumerate(self.params): if param.name.strip() == name: if keep_field: - return self._blank_param_value(param.value) - dependent = [not after.showkey for after in self.params[i+1:]] - if any(dependent) and not param.showkey and not force_no_field: - return self._blank_param_value(param.value) - return self.params.remove(param) - raise ValueError(name) + if self._remove_with_field(param, i, name): + self._blank_param_value(param.value) + keep_field = False + else: + self.params.remove(param) + else: + if self._remove_without_field(param, i, force_no_field): + self.params.remove(param) + else: + self._blank_param_value(param.value) + if not removed: + removed = True + if not removed: + raise ValueError(name)