Rewrite tag parser to be cleaner and safer.

All tag tests passing. Still need to finish backslash support and support for templates and tags within <open> tags.
11 years ago · 5f5a081d91
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -65,15 +65,7 @@ Local (stack-specific) contexts:
 * :py:const:`TAG`

    * :py:const:`TAG_OPEN`

        * :py:const:`TAG_OPEN_NAME`
        * :py:const:`TAG_OPEN_ATTR`

            * :py:const:`TAG_OPEN_ATTR_NAME`
            * :py:const:`TAG_OPEN_ATTR_BODY`
            * :py:const:`TAG_OPEN_ATTR_QUOTED`
            * :py:const:`TAG_OPEN_ATTR_IGNORE`

    * :py:const:`TAG_ATTR`
    * :py:const:`TAG_BODY`
    * :py:const:`TAG_CLOSE`

@@ -93,47 +85,42 @@ Global contexts:

 # Local contexts:

 TEMPLATE =             0b000000000000000000000000111
 TEMPLATE_NAME =        0b000000000000000000000000001
 TEMPLATE_PARAM_KEY =   0b000000000000000000000000010
 TEMPLATE_PARAM_VALUE = 0b000000000000000000000000100

 ARGUMENT =             0b000000000000000000000011000
 ARGUMENT_NAME =        0b000000000000000000000001000
 ARGUMENT_DEFAULT =     0b000000000000000000000010000

 WIKILINK =             0b000000000000000000001100000
 WIKILINK_TITLE =       0b000000000000000000000100000
 WIKILINK_TEXT =        0b000000000000000000001000000

 HEADING =              0b000000000000001111110000000
 HEADING_LEVEL_1 =      0b000000000000000000010000000
 HEADING_LEVEL_2 =      0b000000000000000000100000000
 HEADING_LEVEL_3 =      0b000000000000000001000000000
 HEADING_LEVEL_4 =      0b000000000000000010000000000
 HEADING_LEVEL_5 =      0b000000000000000100000000000
 HEADING_LEVEL_6 =      0b000000000000001000000000000

 COMMENT =              0b000000000000010000000000000

 TAG =                  0b000000111111100000000000000
 TAG_OPEN =             0b000000001111100000000000000
 TAG_OPEN_NAME =        0b000000000000100000000000000
 TAG_OPEN_ATTR =        0b000000001111000000000000000
 TAG_OPEN_ATTR_NAME =   0b000000000001000000000000000
 TAG_OPEN_ATTR_BODY =   0b000000000010000000000000000
 TAG_OPEN_ATTR_QUOTED = 0b000000000100000000000000000
 TAG_OPEN_ATTR_IGNORE = 0b000000001000000000000000000
 TAG_BODY =             0b000000010000000000000000000
 TAG_CLOSE =            0b000000100000000000000000000

 SAFETY_CHECK =         0b111111000000000000000000000
 HAS_TEXT =             0b000001000000000000000000000
 FAIL_ON_TEXT =         0b000010000000000000000000000
 FAIL_NEXT  =           0b000100000000000000000000000
 FAIL_ON_LBRACE =       0b001000000000000000000000000
 FAIL_ON_RBRACE =       0b010000000000000000000000000
 FAIL_ON_EQUALS =       0b100000000000000000000000000
 TEMPLATE =             0b000000000000000000000111
 TEMPLATE_NAME =        0b000000000000000000000001
 TEMPLATE_PARAM_KEY =   0b000000000000000000000010
 TEMPLATE_PARAM_VALUE = 0b000000000000000000000100

 ARGUMENT =             0b000000000000000000011000
 ARGUMENT_NAME =        0b000000000000000000001000
 ARGUMENT_DEFAULT =     0b000000000000000000010000

 WIKILINK =             0b000000000000000001100000
 WIKILINK_TITLE =       0b000000000000000000100000
 WIKILINK_TEXT =        0b000000000000000001000000

 HEADING =              0b000000000001111110000000
 HEADING_LEVEL_1 =      0b000000000000000010000000
 HEADING_LEVEL_2 =      0b000000000000000100000000
 HEADING_LEVEL_3 =      0b000000000000001000000000
 HEADING_LEVEL_4 =      0b000000000000010000000000
 HEADING_LEVEL_5 =      0b000000000000100000000000
 HEADING_LEVEL_6 =      0b000000000001000000000000

 COMMENT =              0b000000000010000000000000

 TAG =                  0b000000111100000000000000
 TAG_OPEN =             0b000000000100000000000000
 TAG_ATTR =             0b000000001000000000000000
 TAG_BODY =             0b000000010000000000000000
 TAG_CLOSE =            0b000000100000000000000000

 SAFETY_CHECK =         0b111111000000000000000000
 HAS_TEXT =             0b000001000000000000000000
 FAIL_ON_TEXT =         0b000010000000000000000000
 FAIL_NEXT  =           0b000100000000000000000000
 FAIL_ON_LBRACE =       0b001000000000000000000000
 FAIL_ON_RBRACE =       0b010000000000000000000000
 FAIL_ON_EQUALS =       0b100000000000000000000000

 # Global contexts:

--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -37,6 +37,26 @@ class BadRoute(Exception):
    pass


 class _TagOpenData(object):
    """Stores data about an HTML open tag, like ``<ref name="foo">``."""
    CX_NAME =        1 << 0
    CX_ATTR_READY =  1 << 1
    CX_ATTR_NAME =   1 << 2
    CX_ATTR_VALUE =  1 << 3
    CX_NEED_SPACE =  1 << 4
    CX_NEED_EQUALS = 1 << 5
    CX_NEED_QUOTE =  1 << 6
    CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE

    def __init__(self):
        self.context = self.CX_NAME
        self.literal = True
        self.padding_buffer = []
        self.quote_buffer = []
        self.reset = 0
        self.ignore_quote = False


 class Tokenizer(object):
    """Creates a list of tokens from a string of wikicode."""
    USES_C = False
@@ -47,6 +67,7 @@ class Tokenizer(object):
    MAX_DEPTH = 40
    MAX_CYCLES = 100000
    regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)
    tag_splitter = re.compile(r"([\s\"\\])")

    def __init__(self):
        self._text = None
@@ -410,165 +431,145 @@ class Tokenizer(object):
        reset = self._head
        self._head += 1
        try:
            tokens = self._parse(contexts.TAG_OPEN_NAME)
            tokens = self._really_parse_tag()
        except BadRoute:
            self._head = reset
            self._write_text("<")
        else:
            self._write_all(tokens)

    def _actually_close_tag_opening(self):
        """Handle cleanup at the end of a opening tag.

        The current context will be updated and the
        :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the
        opening tag's padding to be used in the
        :py:class:`~.tokens.TagOpenClose` token.
        """
        if self._context & contexts.TAG_OPEN_ATTR:
            if self._context & contexts.TAG_OPEN_ATTR_NAME:
                self._context ^= contexts.TAG_OPEN_ATTR_NAME
            if self._context & contexts.TAG_OPEN_ATTR_BODY:
                self._context ^= contexts.TAG_OPEN_ATTR_BODY
        else:
            self._write_first(tokens.TagOpenOpen(showtag=True))
            self._context ^= contexts.TAG_OPEN_NAME
        self._context |= contexts.TAG_BODY

        self._push_textbuffer()
        if isinstance(self._stack[-1], tokens.TagAttrStart):
            return self._stack.pop().padding
        return ""

    def _actually_handle_chunk(self, chunks, is_new):
        """Actually handle a chunk of code within a tag's attributes.
    def _really_parse_tag(self):
        """Actually parse an HTML tag, starting with the open (``<foo>``)."""
        data = _TagOpenData()
        self._push(contexts.TAG_OPEN)
        self._write(tokens.TagOpenOpen(showtag=True))
        while True:
            this, next = self._read(), self._read(1)
            if this not in self.MARKERS:
                for chunk in self.tag_splitter.split(this):
                    if self._handle_tag_chunk(data, chunk):
                        continue
            elif this is self.END:
                if self._context & contexts.TAG_ATTR:
                    self._pop()
                self._fail_route()
            elif this == ">" and data.literal:
                if data.context & data.CX_ATTR:
                    self._push_tag_buffer(data)
                padding = data.padding_buffer[0] if data.padding_buffer else ""
                self._write(tokens.TagCloseOpen(padding=padding))
                self._context = contexts.TAG_BODY
                self._head += 1
                return self._parse(push=False)
            elif this == "/" and next == ">" and data.literal:
                if data.context & data.CX_ATTR:
                    self._push_tag_buffer(data)
                padding = data.padding_buffer[0] if data.padding_buffer else ""
                self._write(tokens.TagCloseSelfclose(padding=padding))
                self._head += 1
                return self._pop()
            else:
                for chunk in self.tag_splitter.split(this):
                    if self._handle_tag_chunk(data, chunk):
                        continue
            self._head += 1

        Called by :py:meth:`_handle_tag_chunk` and
        :py:meth:`_handle_tag_attribute_body`.
        """
        if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
            padding = 0
            while chunks:
                if chunks[0] == "":
                    padding += 1
                    chunks.pop(0)
                else:
                    break
            self._write(tokens.TagAttrStart(padding=" " * padding))
        elif self._context & contexts.TAG_OPEN_ATTR_IGNORE:
            self._context ^= contexts.TAG_OPEN_ATTR_IGNORE
            chunks.pop(0)
    def _handle_tag_chunk(self, data, chunk):
        if not chunk:
            return
        elif is_new and self._context & contexts.TAG_OPEN_ATTR_QUOTED:
            self._write_text(" ")  # Quoted chunks don't lose their spaces

        if chunks:
            chunk = chunks.pop(0)
            if self._context & contexts.TAG_OPEN_ATTR_BODY:
                self._context ^= contexts.TAG_OPEN_ATTR_BODY
                self._context |= contexts.TAG_OPEN_ATTR_NAME
            if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
                if re.search(r'[^\\]"', chunk[:-1]):
                    self._fail_route()
                if re.search(r'[^\\]"$', chunk):
                    self._write_text(chunk[:-1])
                    self._context ^= contexts.TAG_OPEN_ATTR_QUOTED
                    self._context |= contexts.TAG_OPEN_ATTR_NAME
                    return True  # Back to _handle_tag_attribute_body()
        if data.context & data.CX_NAME:
            if chunk != chunk.lstrip():  # Tags cannot start with whitespace
                self._fail_route()
            self._write_text(chunk)

    def _handle_tag_chunk(self, text):
        """Handle a chunk of code within a tag's attributes.

        This is called by :py:meth:`_parse`, which intercepts parsing of
        wikicode when we're inside of an opening tag and no :py:attr:`MARKERS`
        are present.
        """
        if " " not in text and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
            self._write_text(text)
            return
        chunks = text.split(" ")
        is_new = False
        is_quoted = False
        if self._context & contexts.TAG_OPEN_NAME:
            self._write_text(chunks.pop(0))
            self._write_first(tokens.TagOpenOpen(showtag=True))
            self._context ^= contexts.TAG_OPEN_NAME
            self._context |= contexts.TAG_OPEN_ATTR_NAME
            self._actually_handle_chunk(chunks, True)
            is_new = True
        while chunks:
            result = self._actually_handle_chunk(chunks, is_new)
            is_quoted = result or is_quoted
            is_new = True
        if is_quoted:
            return self._pop()

    def _handle_tag_attribute_body(self):
        """Handle the body, or value, of a tag attribute.

        Attribute bodies can usually be handled at once, but sometimes a new
        stack must be created to keep track of "rich" attribute values that
        contain, for example, templates.
        """
        self._context ^= contexts.TAG_OPEN_ATTR_NAME
        self._context |= contexts.TAG_OPEN_ATTR_BODY
        self._write(tokens.TagAttrEquals())
        next = self._read(1)
        if next not in self.MARKERS and next.startswith('"'):
            chunks = None
            if " " in next:
                chunks = next.split(" ")
                next = chunks.pop(0)
            if re.search(r'[^\\]"$', next[1:]):
                if not re.search(r'[^\\]"', next[1:-1]):
                    self._write(tokens.TagAttrQuote())
                    self._write_text(next[1:-1])
                    self._head += 1
            data.context = data.CX_NEED_SPACE
        elif data.context & data.CX_NEED_SPACE:
            if chunk.isspace():
                if data.context & data.CX_ATTR_VALUE:
                    self._push_tag_buffer(data)
                data.padding_buffer.append(chunk)
                data.context = data.CX_ATTR_READY
            else:
                if not re.search(r'[^\\]"', next[1:]):
                    self._head += 1
                    reset = self._head
                    try:
                        attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED |
                                           contexts.TAG_OPEN_ATTR_IGNORE)
                    except BadRoute:
                        self._head = reset
                        self._write_text(next)
                    else:
                        self._write(tokens.TagAttrQuote())
                        self._write_text(next[1:])
                        self._write_all(attr)
                        return
            self._context ^= contexts.TAG_OPEN_ATTR_BODY
            self._context |= contexts.TAG_OPEN_ATTR_NAME
            while chunks:
                self._actually_handle_chunk(chunks, True)
                if data.context & data.CX_ATTR_VALUE:
                    data.context ^= data.CX_NEED_SPACE
                    data.quote_buffer = []
                    data.ignore_quote = True
                    self._head = data.reset
                    return True  # Break out of chunk processing early
                else:
                    self._fail_route()
        elif data.context & data.CX_ATTR_READY:
            if chunk.isspace():
                data.padding_buffer.append(chunk)
            else:
                data.context = data.CX_ATTR_NAME
                self._push(contexts.TAG_ATTR)
                self._write_text(chunk)                        ### hook on here for {, <, etc
        elif data.context & data.CX_ATTR_NAME:
            if chunk.isspace():
                data.padding_buffer.append(chunk)
                data.context |= data.CX_NEED_EQUALS
            elif chunk == "=":
                if not data.context & data.CX_NEED_EQUALS:
                    data.padding_buffer.append("")  # No padding before equals
                data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE
                self._write(tokens.TagAttrEquals())
            else:
                if data.context & data.CX_NEED_EQUALS:
                    self._push_tag_buffer(data)
                    data.padding_buffer.append("")  # No padding before tag
                    data.context = data.CX_ATTR_NAME
                    self._push(contexts.TAG_ATTR)
                self._write_text(chunk)                        ### hook on here for {, <, etc
        elif data.context & data.CX_ATTR_VALUE:
            ### handle backslashes here
            if data.context & data.CX_NEED_QUOTE:
                if chunk == '"' and not data.ignore_quote:
                    data.context ^= data.CX_NEED_QUOTE
                    data.literal = False
                    data.reset = self._head
                elif chunk.isspace():
                    data.padding_buffer.append(chunk)
                else:
                    data.context ^= data.CX_NEED_QUOTE
                    self._write_text(chunk)                    ### hook on here for {, <, etc
            elif not data.literal:
                if chunk == '"':
                    data.context |= data.CX_NEED_SPACE
                    data.literal = True
                else:
                    data.quote_buffer.append(chunk)
            elif chunk.isspace():
                self._push_tag_buffer(data)
                data.padding_buffer.append(chunk)
                data.context = data.CX_ATTR_READY
            else:
                self._write_text(chunk)                        ### hook on here for {, <, etc

    def _push_tag_buffer(self, data):
        buf = data.padding_buffer
        while len(buf) < 3:
            buf.append("")
        self._write_first(tokens.TagAttrStart(
            pad_after_eq=buf.pop(), pad_before_eq=buf.pop(),
            pad_first=buf.pop()))
        if data.quote_buffer:
            self._write(tokens.TagAttrQuote())
            self._write_text("".join(data.quote_buffer))
        self._write_all(self._pop())
        data.padding_buffer, data.quote_buffer = [], []
        data.ignore_quote = False

    def _get_tag_from_stack(self, stack=None):
        """Return the tag based on the text in *stack*."""
        if not stack:
            sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen)
            func = lambda tok: not isinstance(tok, sentinels)
            stack = takewhile(func, self._stack)
            pred = lambda tok: not isinstance(tok, sentinels)
            stack = takewhile(pred, self._stack)
        text = [tok.text for tok in stack if isinstance(tok, tokens.Text)]
        return "".join(text).rstrip().lower()

    def _handle_tag_close_open(self):
        """Handle the ending of an open tag (``<foo>``)."""
        padding = self._actually_close_tag_opening()
        if not self._get_tag_from_stack():  # Tags cannot be blank
            self._fail_route()
        self._write(tokens.TagCloseOpen(padding=padding))

    def _handle_tag_selfclose(self):
        """Handle the ending of an tag that closes itself (``<foo />``)."""
        padding = self._actually_close_tag_opening()
        if not self._get_tag_from_stack():  # Tags cannot be blank
        try:
            return "".join(text).rstrip().lower().split()[0]
        except IndexError:
            self._fail_route()
        self._write(tokens.TagCloseSelfclose(padding=padding))
        self._head += 1
        return self._pop()

    def _handle_tag_open_close(self):
        """Handle the opening of a closing tag (``</foo>``)."""
@@ -579,10 +580,7 @@ class Tokenizer(object):
    def _handle_tag_close_close(self):
        """Handle the ending of a closing tag (``</foo>``)."""
        closing = self._pop()
        close_tag = self._get_tag_from_stack(closing)
        open_tag = self._get_tag_from_stack()
        if not close_tag or close_tag != open_tag:
            # Closing and opening tags are empty or unequal, so fail this tag:
        if self._get_tag_from_stack(closing) != self._get_tag_from_stack():
            self._fail_route()
        self._write_all(closing)
        self._write(tokens.TagCloseClose())
@@ -645,37 +643,30 @@ class Tokenizer(object):
                self._context |= contexts.FAIL_ON_RBRACE
            return True

    def _parse(self, context=0):
    def _parse(self, context=0, push=True):
        """Parse the wikicode string, using *context* for when to stop."""
        self._push(context)
        unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
                  contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME |
                  contexts.TAG_CLOSE)
        fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
                contexts.HEADING | contexts.COMMENT | contexts.TAG)
        double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)

        if push:
            self._push(context)
        while True:
            this = self._read()
            unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
                      contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME |
                      contexts.TAG_CLOSE)
            if self._context & unsafe:
                if not self._verify_safe(this):
                    double = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
                    if self._context & double:
                    if self._context & double_fail:
                        self._pop()
                    self._fail_route()
            if this not in self.MARKERS:
                if self._context & contexts.TAG_OPEN:
                    should_exit = self._handle_tag_chunk(this)
                    if should_exit:
                        return should_exit
                else:
                    self._write_text(this)
                self._write_text(this)
                self._head += 1
                continue
            if this is self.END:
                fail = (
                    contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
                    contexts.HEADING | contexts.COMMENT | contexts.TAG)
                if self._context & fail:
                    double_fail = (
                        contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE |
                        contexts.TAG_OPEN_ATTR_QUOTED)
                    if self._context & double_fail:
                        self._pop()
                    self._fail_route()
@@ -720,8 +711,6 @@ class Tokenizer(object):
            elif this == "=" and not self._global & contexts.GL_HEADING:
                if self._read(-1) in ("\n", self.START):
                    self._parse_heading()
                elif self._context & contexts.TAG_OPEN_ATTR_NAME:
                    self._handle_tag_attribute_body()
                else:
                    self._write_text("=")
            elif this == "=" and self._context & contexts.HEADING:
@@ -735,22 +724,8 @@ class Tokenizer(object):
                    self._parse_comment()
                else:
                    self._write_text(this)
            elif this == "<" and next != "/" and (
                    not self._context & (contexts.TAG ^ contexts.TAG_BODY)):
            elif this == "<" and next != "/" and not self._context & contexts.TAG_CLOSE:
                self._parse_tag()
            elif self._context & contexts.TAG_OPEN:
                if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
                    self._handle_tag_chunk(this)
                elif this == "\n":
                    self._fail_route()
                elif this == ">":
                    self._handle_tag_close_open()
                elif this == "/" and next == ">":
                    return self._handle_tag_selfclose()
                elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME:
                    self._handle_tag_attribute_body()
                else:
                    self._handle_tag_chunk(this)
            elif this == "<" and next == "/" and self._context & contexts.TAG_BODY:
                self._handle_tag_open_close()
            elif this == ">" and self._context & contexts.TAG_CLOSE: