From 33acb3eea3d9cedf9ca7def41499487860b2a94b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 6 Aug 2012 00:44:06 -0400 Subject: [PATCH] Starting main parser: Parser, Tokens, Tokenizer, Builder, BuildStack. --- mwparserfromhell/nodes/__init__.py | 2 +- mwparserfromhell/parser/__init__.py | 16 +++++- mwparserfromhell/parser/build_stack.py | 36 +++++++++++++ mwparserfromhell/parser/builder.py | 93 ++++++++++++++++++++++++++++++++++ mwparserfromhell/parser/demo.py | 53 ------------------- mwparserfromhell/parser/tokenizer.py | 30 +++++++++++ mwparserfromhell/parser/tokens.py | 65 ++++++++++++++++++++++++ 7 files changed, 240 insertions(+), 55 deletions(-) create mode 100644 mwparserfromhell/parser/build_stack.py create mode 100644 mwparserfromhell/parser/builder.py delete mode 100644 mwparserfromhell/parser/demo.py create mode 100644 mwparserfromhell/parser/tokenizer.py create mode 100644 mwparserfromhell/parser/tokens.py diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index f749e71..0777479 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -22,7 +22,7 @@ from ..string_mixin import StringMixIn -__all__ = ["Node"] +__all__ = ["Node", "Text", "Heading", "HTMLEntity", "Tag", "Template"] class Node(StringMixIn): def __iternodes__(self, getter): diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index c32a549..f70273f 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -20,4 +20,18 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from .demo import DemoParser as Parser +from .builder import Builder +from .tokenizer import Tokenizer + +__all__ = ["Parser"] + +class Parser(object): + def __init__(self, text): + self.text = text + self._tokenizer = Tokenizer() + self._builder = Builder() + + def parse(self): + tokens = self._tokenizer.tokenize(self.text) + code = self._builder.build(tokens) + return code diff --git a/mwparserfromhell/parser/build_stack.py b/mwparserfromhell/parser/build_stack.py new file mode 100644 index 0000000..66cc67b --- /dev/null +++ b/mwparserfromhell/parser/build_stack.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +__all__ = ["BuildStack"] + +class BuildStack(object): + def __init__(self): + pass + + def write(self, item): + pass + + def push(self): + pass + + def pop(self): + pass diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py new file mode 100644 index 0000000..cedb83f --- /dev/null +++ b/mwparserfromhell/parser/builder.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import re + +from . 
import tokens +from .build_stack import BuildStack +from ..nodes import Template, Text +from ..nodes.extras import Parameter +from ..smart_list import SmartList +from ..wikicode import Wikicode + +__all__ = ["Builder"] + +class Builder(object): + def __init__(self): + self._tokens = [] + self._stack = BuildStack() + + def _pop(self): + return Wikicode(SmartList(stack.pop())) + + def _handle_parameter(self, key): + showkey = False + self._stack.push() + while self._tokens: + token = self._tokens.pop(0) + if isinstance(token, tokens.TEMPLATE_PARAM_EQUALS): + key = self._pop() + showkey = True + self._stack.push() + elif isinstance(token, (tokens.TEMPLATE_PARAM_SEPARATOR, + tokens.TEMPLATE_CLOSE)): + self._tokens.insert(0, token) + value = self._pop() + return Parameter(key, value, showkey) + else: + self._stack.write(self._handle_token()) + + def _handle_template(self): + params = [] + int_keys = set() + int_key_range = {1} + self._stack.push() + while self._tokens: + token = self._tokens.pop(0) + if isinstance(token, tokens.TEMPLATE_PARAM_SEPARATOR): + if not params: + name = self._pop() + param = self._handle_parameter(min(int_key_range - int_keys)) + if re.match(r"[1-9][0-9]*$", param.key.strip()): + int_keys.add(int(param.key)) + int_key_range.add(len(int_keys) + 1) + params.append(param) + elif isinstance(token, tokens.TEMPLATE_CLOSE): + if not params: + name = self._pop() + return Template(name, params) + else: + self._stack.write(self._handle_token()) + + def _handle_token(self): + token = self._tokens.pop(0) + if isinstance(token, tokens.TEXT): + return Text(token.text) + elif isinstance(token, tokens.TEMPLATE_OPEN): + return self._handle_template() + + def build(self, tokens): + self._tokens = tokens + self._stack.push() + while self._tokens: + self._stack.write(self._handle_token()) + return self._pop() diff --git a/mwparserfromhell/parser/demo.py b/mwparserfromhell/parser/demo.py deleted file mode 100644 index f8ce479..0000000 --- 
a/mwparserfromhell/parser/demo.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2012 Ben Kurtovic -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -from ..nodes import Template, Text -from ..nodes.extras import Parameter -from ..smart_list import SmartList -from ..wikicode import Wikicode - -__all__ = ["DemoParser"] - -class DemoParser(object): - def __init__(self, text): - self.text = text - - def _tokenize(self): - return [] - - def parse(self): - # Ensure text is unicode! - text = u"This is a {{test}} message with a {{template|with|foo={{params}}}}." 
- - node1 = Text(u"This is a ") - node2 = Template(Wikicode([Text(u"test")])) - node3 = Text(u" message with a ") - node4_param1_name = Wikicode([Text(u"1")]) - node4_param1_value = Wikicode([Text(u"with")]) - node4_param1 = Parameter(node4_param1_name, node4_param1_value, showkey=False) - node4_param2_name = Wikicode([Text(u"foo")]) - node4_param2_value = Wikicode([Template(Wikicode([Text(u"params")]))]) - node4_param2 = Parameter(node4_param2_name, node4_param2_value, showkey=True) - node4 = Template(Wikicode([Text(u"template")]), [node4_param1, node4_param2]) - node5 = Text(u".") - parsed = Wikicode(SmartList([node1, node2, node3, node4, node5])) - return parsed diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py new file mode 100644 index 0000000..0417489 --- /dev/null +++ b/mwparserfromhell/parser/tokenizer.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from . import tokens + +__all__ = ["Tokenizer"] + +class Tokenizer(object): + def tokenize(self, text): + tokens = [tokens.Text(text=text)] + return tokens diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py new file mode 100644 index 0000000..6228f7d --- /dev/null +++ b/mwparserfromhell/parser/tokens.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
__all__ = ["Token"]


class Token(object):
    """Base class for tokens; keyword arguments become instance attributes.

    Every keyword passed to the constructor is stored in a private mapping
    and exposed through normal attribute access, so ``TEXT(text=u"foo").text``
    returns ``u"foo"``. Attributes may be reassigned or deleted afterwards.

    Accessing or deleting an attribute that was never set raises
    ``AttributeError`` so that ``hasattr()`` and three-argument ``getattr()``
    behave as they do for ordinary objects.
    """

    def __init__(self, **kwargs):
        # Store the mapping with object.__setattr__: going through our own
        # __setattr__ would read the mapping before it exists, fall into
        # __getattr__, and recurse until the interpreter gives up.
        object.__setattr__(self, "_Token__kwargs", kwargs)

    def __repr__(self):
        pairs = sorted(self.__kwargs.items())
        args = ", ".join("{0}={1!r}".format(key, value) for key, value in pairs)
        return "{0}({1})".format(type(self).__name__, args)

    def __getattr__(self, key):
        # Only reached when regular lookup fails. If the mapping itself is
        # missing (instance created without running __init__), fail cleanly
        # instead of recursing.
        if key == "_Token__kwargs":
            raise AttributeError(key)
        try:
            return self.__kwargs[key]
        except KeyError:
            raise AttributeError(key)

    def __setattr__(self, key, value):
        self.__kwargs[key] = value

    def __delattr__(self, key):
        try:
            del self.__kwargs[key]
        except KeyError:
            raise AttributeError(key)


def make(name):
    """Create a ``Token`` subclass called *name* and export it via __all__."""
    __all__.append(name)
    return type(name, (Token,), {})


TEXT = make("TEXT")

TEMPLATE_OPEN = make("TEMPLATE_OPEN")                        # {{
TEMPLATE_PARAM_SEPARATOR = make("TEMPLATE_PARAM_SEPARATOR")  # |
TEMPLATE_PARAM_EQUALS = make("TEMPLATE_PARAM_EQUALS")        # =
TEMPLATE_CLOSE = make("TEMPLATE_CLOSE")                      # }}

HTML_ENTITY_START = make("HTML_ENTITY_START")                # &
HTML_ENTITY_NUMERIC = make("HTML_ENTITY_NUMERIC")            # #
HTML_ENTITY_HEX = make("HTML_ENTITY_HEX")                    # x
HTML_ENTITY_END = make("HTML_ENTITY_END")                    # ;

HEADING_BLOCK = make("HEADING_BLOCK")                        # =...

TAG_OPEN_OPEN = make("TAG_OPEN_OPEN")                        # <
TAG_ATTR_EQUALS = make("TAG_ATTR_EQUALS")                    # =
TAG_ATTR_QUOTE = make("TAG_ATTR_QUOTE")                      # "
TAG_CLOSE_OPEN = make("TAG_CLOSE_OPEN")                      # >
TAG_CLOSE_SELFCLOSE = make("TAG_CLOSE_SELFCLOSE")            # />
# NOTE(review): the marker comment for TAG_OPEN_CLOSE is empty in the
# original text; presumably it was "</" -- confirm against the repository.
TAG_OPEN_CLOSE = make("TAG_OPEN_CLOSE")

# The factory is only needed while this module is being defined.
del make