From 81e5ce30af6f4dbde1e4a426ae0c5eebe2b0ca15 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Mon, 13 Aug 2012 19:43:06 -0400
Subject: [PATCH] Working on the framework for the tokenizer, plus some
 cleanup, fixes.

---
 mwparserfromhell/nodes/template.py   |  2 +-
 mwparserfromhell/parser/__init__.py  |  8 ++++--
 mwparserfromhell/parser/builder.py   | 56 ++++++++++++++++++------------------
 mwparserfromhell/parser/tokenizer.py | 47 +++++++++++++++++++++++++++---
 mwparserfromhell/parser/tokens.py    | 44 ++++++++++++++--------------
 5 files changed, 100 insertions(+), 57 deletions(-)

diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py
index 0b65aa7..d77388f 100644
--- a/mwparserfromhell/nodes/template.py
+++ b/mwparserfromhell/nodes/template.py
@@ -90,7 +90,7 @@ class Template(Node):
         before_theories = defaultdict(lambda: 0)
         after_theories = defaultdict(lambda: 0)
         for param in self.params:
-            match = re.search("^(\s*).*?(\s*)$", unicode(param.value), FLAGS)
+            match = re.search(r"^(\s*).*?(\s*)$", unicode(param.value), FLAGS)
             before, after = match.group(1), match.group(2)
             before_theories[before] += 1
             after_theories[after] += 1
diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py
index f70273f..49ea940 100644
--- a/mwparserfromhell/parser/__init__.py
+++ b/mwparserfromhell/parser/__init__.py
@@ -20,8 +20,12 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from .builder import Builder
-from .tokenizer import Tokenizer
+try:
+    from ._builder import CBuilder as Builder
+    from ._tokenizer import CTokenizer as Tokenizer
+except ImportError:
+    from .builder import Builder
+    from .tokenizer import Tokenizer
 
 __all__ = ["Parser"]
 
diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py
index 3b6a643..80354a9 100644
--- a/mwparserfromhell/parser/builder.py
+++ b/mwparserfromhell/parser/builder.py
@@ -49,17 +49,17 @@ class Builder(object):
         self._push()
         while self._tokens:
             token = self._tokens.pop(0)
-            if isinstance(token, tokens.TEMPLATE_PARAM_EQUALS):
+            if isinstance(token, tokens.TemplateParamEquals):
                 key = self._pop()
                 showkey = True
                 self._push()
-            elif isinstance(token, (tokens.TEMPLATE_PARAM_SEPARATOR,
-                                    tokens.TEMPLATE_CLOSE)):
+            elif isinstance(token, (tokens.TemplateParamSeparator,
+                                    tokens.TemplateClose)):
                 self._tokens.insert(0, token)
                 value = self._pop()
                 return Parameter(key, value, showkey)
             else:
-                self._stack.write(self._handle_token())
+                self._write(self._handle_token())
 
     def _handle_template(self):
         params = []
@@ -68,7 +68,7 @@
         self._push()
         while self._tokens:
             token = self._tokens.pop(0)
-            if isinstance(token, tokens.TEMPLATE_PARAM_SEPARATOR):
+            if isinstance(token, tokens.TemplateParamSeparator):
                 if not params:
                     name = self._pop()
                 param = self._handle_parameter(min(int_key_range - int_keys))
@@ -76,18 +76,18 @@
                     int_keys.add(int(param.name))
                     int_key_range.add(len(int_keys) + 1)
                 params.append(param)
-            elif isinstance(token, tokens.TEMPLATE_CLOSE):
+            elif isinstance(token, tokens.TemplateClose):
                 if not params:
                     name = self._pop()
                 return Template(name, params)
             else:
-                self._stack.write(self._handle_token())
+                self._write(self._handle_token())
 
     def _handle_entity(self):
         token = self._tokens.pop(0)
-        if isinstance(token, tokens.HTML_ENTITY_NUMERIC):
+        if isinstance(token, tokens.HTMLEntityNumeric):
             token = self._tokens.pop(0)
-            if isinstance(token, tokens.HTML_ENTITY_HEX):
+            if isinstance(token, tokens.HTMLEntityHex):
                 token = self._tokens.pop(0)
                 return HTMLEntity(token.text, named=False, hexadecimal=True)
             return HTMLEntity(token.text, named=False, hexadecimal=False)
@@ -98,30 +98,30 @@ class Builder(object):
         self._push()
         while self._tokens:
             token = self._tokens.pop(0)
-            if isinstance(token, tokens.HEADING_BLOCK):
+            if isinstance(token, tokens.HeadingBlock):
                 title = self._pop()
                 return Heading(title, level)
             else:
-                self._stack.write(self._handle_token())
+                self._write(self._handle_token())
 
     def _handle_attribute(self):
         name, quoted = None, False
         self._push()
         while self._tokens:
             token = self._tokens.pop(0)
-            if isinstance(token, tokens.TAG_ATTR_EQUALS):
+            if isinstance(token, tokens.TagAttrEquals):
                 name = self._pop()
                 self._push()
-            elif isinstance(token, tokens.TAG_ATTR_QUOTE):
+            elif isinstance(token, tokens.TagAttrQuote):
                 quoted = True
-            elif isinstance(token, (tokens.TAG_ATTR_START,
-                                    tokens.TAG_CLOSE_OPEN)):
+            elif isinstance(token, (tokens.TagAttrStart,
+                                    tokens.TagCloseOpen)):
                 self._tokens.insert(0, token)
                 if name is not None:
                     return Attribute(name, self._pop(), quoted)
                 return Attribute(self._pop(), quoted=quoted)
             else:
-                self._stack.write(self._handle_token())
+                self._write(self._handle_token())
 
     def _handle_tag(self, token):
         type_, showtag = token.type, token.showtag
@@ -129,40 +129,40 @@
         self._push()
         while self._tokens:
             token = self._tokens.pop(0)
-            if isinstance(token, tokens.TAG_ATTR_START):
+            if isinstance(token, tokens.TagAttrStart):
                 attrs.append(self._handle_attribute())
-            elif isinstance(token, tokens.TAG_CLOSE_OPEN):
+            elif isinstance(token, tokens.TagCloseOpen):
                 open_pad = token.padding
                 tag = self._pop()
                 self._push()
-            elif isinstance(token, tokens.TAG_CLOSE_SELFCLOSE):
+            elif isinstance(token, tokens.TagCloseSelfclose):
                 tag = self._pop()
                 return Tag(type_, tag, attrs=attrs, showtag=showtag,
                            self_closing=True, open_padding=token.padding)
-            elif isinstance(token, tokens.TAG_OPEN_CLOSE):
+            elif isinstance(token, tokens.TagOpenClose):
                 contents = self._pop()
-            elif isinstance(token, tokens.TAG_CLOSE_CLOSE):
+            elif isinstance(token, tokens.TagCloseClose):
                 return Tag(type_, tag, contents, attrs, showtag, False,
                            open_pad, token.padding)
             else:
-                self._stack.write(self._handle_token())
+                self._write(self._handle_token())
 
     def _handle_token(self):
         token = self._tokens.pop(0)
-        if isinstance(token, tokens.TEXT):
+        if isinstance(token, tokens.Text):
            return Text(token.text)
-        elif isinstance(token, tokens.TEMPLATE_OPEN):
+        elif isinstance(token, tokens.TemplateOpen):
            return self._handle_template()
-        elif isinstance(token, tokens.HTML_ENTITY_START):
+        elif isinstance(token, tokens.HTMLEntityStart):
            return self._handle_entity()
-        elif isinstance(token, tokens.HEADING_BLOCK):
+        elif isinstance(token, tokens.HeadingBlock):
            return self._handle_heading(token)
-        elif isinstance(token, tokens.TAG_OPEN_OPEN):
+        elif isinstance(token, tokens.TagOpenOpen):
            return self._handle_tag(token)
 
     def build(self, tokenlist):
         self._tokens = tokenlist
         self._push()
         while self._tokens:
-            self._stack.write(self._handle_token())
+            self._write(self._handle_token())
         return self._pop()
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 10b4d8a..36c4517 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -25,12 +25,51 @@ from . import tokens
import tokens __all__ = ["Tokenizer"] class Tokenizer(object): + START = object() + END = object() + def __init__(self): self._text = None self._head = 0 - self._tokens = [] + self._stacks = [] + + self._modifiers = [] + + def _push(self): + self._stacks.append([]) + + def _pop(self): + return self._stacks.pop() + + def _write(self, token, stack=None): + if stack is None: + stack = self._stacks[-1] + if not stack: + stack.append(token) + return + last = stack[-1] + if isinstance(token, tokens.Text) and isinstance(last, tokens.Text): + last.text += token.text + else: + stack.append(token) + + def _read(self, delta=0, wrap=False): + index = self._head + delta + if index < 0 and (not wrap or abs(index) > len(self._text)): + return self.START + if index >= len(self._text): + return self.END + return self._text[index] + + def _parse_until(self, stop): + self._push() + while True: + if self._read() in (stop, self.END): + return self._pop() + else: + self._write(tokens.Text(text=self._read())) + self._head += 1 def tokenize(self, text): - self._text = text - self._tokens.append(tokens.TEXT(text=text)) - return self._tokens + self._text = list(text) + return self._parse_until(stop=self.END) diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 05dbcc9..322b801 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -24,43 +24,43 @@ __all__ = ["Token"] class Token(object): def __init__(self, **kwargs): - self.__kwargs = kwargs + super(Token, self).__setattr__("_kwargs", kwargs) def __getattr__(self, key): - return self.__kwargs[key] + return self._kwargs[key] def __setattr__(self, key, value): - self.__kwargs[key] = value + self._kwargs[key] = value def __delattr__(self, key): - del self.__kwargs[key] + del self._kwargs[key] def make(name): __all__.append(name) return type(name, (Token,), {}) -TEXT = make("TEXT") +Text = make("Text") -TEMPLATE_OPEN = make("TEMPLATE_OPEN") # {{ -TEMPLATE_PARAM_SEPARATOR = make("TEMPLATE_PARAM_SEPARATOR") # | -TEMPLATE_PARAM_EQUALS = make("TEMPLATE_PARAM_EQUALS") # = -TEMPLATE_CLOSE = make("TEMPLATE_CLOSE") # }} +TemplateOpen = make("TemplateOpen") # {{ +TemplateParamSeparator = make("TemplateParamSeparator") # | +TemplateParamEquals = make("TemplateParamEquals") # = +TemplateClose = make("TemplateClose") # }} -HTML_ENTITY_START = make("HTML_ENTITY_START") # & -HTML_ENTITY_NUMERIC = make("HTML_ENTITY_NUMERIC") # # -HTML_ENTITY_HEX = make("HTML_ENTITY_HEX") # x -HTML_ENTITY_END = make("HTML_ENTITY_END") # ; +HTMLEntityStart = make("HTMLEntityStart") # & +HTMLEntityNumeric = make("HTMLEntityNumeric") # # +HTMLEntityHex = make("HTMLEntityHex") # X +HTMLEntityEnd = make("HTMLEntityEnd") # ; -HEADING_BLOCK = make("HEADING_BLOCK") # =... +HeadingBlock = make("HeadingBlock") # =... -TAG_OPEN_OPEN = make("TAG_OPEN_OPEN") # < -TAG_ATTR_START = make("TAG_ATTR_START") -TAG_ATTR_EQUALS = make("TAG_ATTR_EQUALS") # = -TAG_ATTR_QUOTE = make("TAG_ATTR_QUOTE") # " -TAG_CLOSE_OPEN = make("TAG_CLOSE_OPEN") # > -TAG_CLOSE_SELFCLOSE = make("TAG_CLOSE_SELFCLOSE") # /> -TAG_OPEN_CLOSE = make("TAG_OPEN_CLOSE") # +TagOpenOpen = make("TagOpenOpen") # < +TagAttrStart = make("TagAttrStart") +TagAttrEquals = make("TagAttrEquals") # = +TagAttrQuote = make("TagAttrQuote") # " +TagCloseOpen = make("TagCloseOpen") # > +TagCloseSelfclose = make("TagCloseSelfclose") # /> +TagOpenClose = make("TagOpenClose") # del make