diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py
index 3c9c798..9e5e568 100644
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -65,15 +65,7 @@ Local (stack-specific) contexts:
 * :py:const:`TAG`
 
     * :py:const:`TAG_OPEN`
-
-        * :py:const:`TAG_OPEN_NAME`
-        * :py:const:`TAG_OPEN_ATTR`
-
-            * :py:const:`TAG_OPEN_ATTR_NAME`
-            * :py:const:`TAG_OPEN_ATTR_BODY`
-            * :py:const:`TAG_OPEN_ATTR_QUOTED`
-            * :py:const:`TAG_OPEN_ATTR_IGNORE`
-
+    * :py:const:`TAG_ATTR`
     * :py:const:`TAG_BODY`
     * :py:const:`TAG_CLOSE`
 
@@ -93,47 +85,42 @@ Global contexts:
 
 # Local contexts:
 
-TEMPLATE =             0b000000000000000000000000111
-TEMPLATE_NAME =        0b000000000000000000000000001
-TEMPLATE_PARAM_KEY =   0b000000000000000000000000010
-TEMPLATE_PARAM_VALUE = 0b000000000000000000000000100
-
-ARGUMENT =             0b000000000000000000000011000
-ARGUMENT_NAME =        0b000000000000000000000001000
-ARGUMENT_DEFAULT =     0b000000000000000000000010000
-
-WIKILINK =             0b000000000000000000001100000
-WIKILINK_TITLE =       0b000000000000000000000100000
-WIKILINK_TEXT =        0b000000000000000000001000000
-
-HEADING =              0b000000000000001111110000000
-HEADING_LEVEL_1 =      0b000000000000000000010000000
-HEADING_LEVEL_2 =      0b000000000000000000100000000
-HEADING_LEVEL_3 =      0b000000000000000001000000000
-HEADING_LEVEL_4 =      0b000000000000000010000000000
-HEADING_LEVEL_5 =      0b000000000000000100000000000
-HEADING_LEVEL_6 =      0b000000000000001000000000000
-
-COMMENT =              0b000000000000010000000000000
-
-TAG =                  0b000000111111100000000000000
-TAG_OPEN =             0b000000001111100000000000000
-TAG_OPEN_NAME =        0b000000000000100000000000000
-TAG_OPEN_ATTR =        0b000000001111000000000000000
-TAG_OPEN_ATTR_NAME =   0b000000000001000000000000000
-TAG_OPEN_ATTR_BODY =   0b000000000010000000000000000
-TAG_OPEN_ATTR_QUOTED = 0b000000000100000000000000000
-TAG_OPEN_ATTR_IGNORE = 0b000000001000000000000000000
-TAG_BODY =             0b000000010000000000000000000
-TAG_CLOSE =            0b000000100000000000000000000
-
-SAFETY_CHECK =         0b111111000000000000000000000
-HAS_TEXT =             0b000001000000000000000000000
-FAIL_ON_TEXT =         0b000010000000000000000000000
-FAIL_NEXT =            0b000100000000000000000000000
-FAIL_ON_LBRACE =       0b001000000000000000000000000
-FAIL_ON_RBRACE =       0b010000000000000000000000000
-FAIL_ON_EQUALS =       0b100000000000000000000000000
+TEMPLATE =             0b000000000000000000000111
+TEMPLATE_NAME =        0b000000000000000000000001
+TEMPLATE_PARAM_KEY =   0b000000000000000000000010
+TEMPLATE_PARAM_VALUE = 0b000000000000000000000100
+
+ARGUMENT =             0b000000000000000000011000
+ARGUMENT_NAME =        0b000000000000000000001000
+ARGUMENT_DEFAULT =     0b000000000000000000010000
+
+WIKILINK =             0b000000000000000001100000
+WIKILINK_TITLE =       0b000000000000000000100000
+WIKILINK_TEXT =        0b000000000000000001000000
+
+HEADING =              0b000000000001111110000000
+HEADING_LEVEL_1 =      0b000000000000000010000000
+HEADING_LEVEL_2 =      0b000000000000000100000000
+HEADING_LEVEL_3 =      0b000000000000001000000000
+HEADING_LEVEL_4 =      0b000000000000010000000000
+HEADING_LEVEL_5 =      0b000000000000100000000000
+HEADING_LEVEL_6 =      0b000000000001000000000000
+
+COMMENT =              0b000000000010000000000000
+
+TAG =                  0b000000111100000000000000
+TAG_OPEN =             0b000000000100000000000000
+TAG_ATTR =             0b000000001000000000000000
+TAG_BODY =             0b000000010000000000000000
+TAG_CLOSE =            0b000000100000000000000000
+
+SAFETY_CHECK =         0b111111000000000000000000
+HAS_TEXT =             0b000001000000000000000000
+FAIL_ON_TEXT =         0b000010000000000000000000
+FAIL_NEXT =            0b000100000000000000000000
+FAIL_ON_LBRACE =       0b001000000000000000000000
+FAIL_ON_RBRACE =       0b010000000000000000000000
+FAIL_ON_EQUALS =       0b100000000000000000000000
 
 # Global contexts:
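A note on the renumbering above: dropping the six TAG_OPEN_* flags shrinks every value from 27 bits to 24, but the layout invariant is unchanged: each aggregate constant is the bitwise OR of its members, so a single AND tests for a whole family. A quick sanity check of the new values (my own sketch, not part of the diff):

    TAG_OPEN  = 0b000000000100000000000000
    TAG_ATTR  = 0b000000001000000000000000
    TAG_BODY  = 0b000000010000000000000000
    TAG_CLOSE = 0b000000100000000000000000
    TAG       = 0b000000111100000000000000

    assert TAG == TAG_OPEN | TAG_ATTR | TAG_BODY | TAG_CLOSE
    assert TAG_ATTR & TAG and not TAG_ATTR & TAG_CLOSE  # membership is one AND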
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 93e9a8d..a7b9e16 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -37,6 +37,26 @@ class BadRoute(Exception):
     pass
 
 
+class _TagOpenData(object):
+    """Stores data about an HTML open tag, like ``<ref name="foo">``."""
+    CX_NAME =        1 << 0
+    CX_ATTR_READY =  1 << 1
+    CX_ATTR_NAME =   1 << 2
+    CX_ATTR_VALUE =  1 << 3
+    CX_NEED_SPACE =  1 << 4
+    CX_NEED_EQUALS = 1 << 5
+    CX_NEED_QUOTE =  1 << 6
+    CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE
+
+    def __init__(self):
+        self.context = self.CX_NAME
+        self.literal = True
+        self.padding_buffer = []
+        self.quote_buffer = []
+        self.reset = 0
+        self.ignore_quote = False
+
+
 class Tokenizer(object):
     """Creates a list of tokens from a string of wikicode."""
     USES_C = False
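The CX_* constants form a private state machine for a single open tag; they never mix with the tokenizer's stack contexts, which is what lets the public context set shrink to TAG_OPEN/TAG_ATTR/TAG_BODY/TAG_CLOSE above. A hypothetical transition trace for `<ref name="foo">` (my sketch; the real transitions happen inside _handle_tag_chunk below):

    data = _TagOpenData()
    assert data.context == _TagOpenData.CX_NAME   # scanning "ref"
    data.context = _TagOpenData.CX_ATTR_READY     # after the space
    data.context = _TagOpenData.CX_ATTR_NAME      # scanning "name"
    assert data.context & _TagOpenData.CX_ATTR    # CX_ATTR covers name and value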
@@ -47,6 +67,7 @@ class Tokenizer(object):
     MAX_DEPTH = 40
     MAX_CYCLES = 100000
     regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)
+    tag_splitter = re.compile(r"([\s\"\\])")
 
     def __init__(self):
         self._text = None
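Since tag_splitter's character class is a capturing group, re.split() returns the delimiters too: whitespace, quotes, and backslashes come back as their own chunks for _handle_tag_chunk to dispatch on. For example (sketch):

    >>> import re
    >>> tag_splitter = re.compile(r"([\s\"\\])")
    >>> tag_splitter.split('name="foo bar"')
    ['name=', '"', 'foo', ' ', 'bar', '"', '']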
@@ -410,165 +431,145 @@ class Tokenizer(object):
         reset = self._head
         self._head += 1
         try:
-            tokens = self._parse(contexts.TAG_OPEN_NAME)
+            tokens = self._really_parse_tag()
         except BadRoute:
             self._head = reset
             self._write_text("<")
         else:
             self._write_all(tokens)
 
-    def _actually_close_tag_opening(self):
-        """Handle cleanup at the end of an opening tag.
-
-        The current context will be updated and the
-        :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the
-        opening tag's padding to be used in the
-        :py:class:`~.tokens.TagOpenClose` token.
-        """
-        if self._context & contexts.TAG_OPEN_ATTR:
-            if self._context & contexts.TAG_OPEN_ATTR_NAME:
-                self._context ^= contexts.TAG_OPEN_ATTR_NAME
-            if self._context & contexts.TAG_OPEN_ATTR_BODY:
-                self._context ^= contexts.TAG_OPEN_ATTR_BODY
-        else:
-            self._write_first(tokens.TagOpenOpen(showtag=True))
-            self._context ^= contexts.TAG_OPEN_NAME
-        self._context |= contexts.TAG_BODY
-
-        self._push_textbuffer()
-        if isinstance(self._stack[-1], tokens.TagAttrStart):
-            return self._stack.pop().padding
-        return ""
-
-    def _actually_handle_chunk(self, chunks, is_new):
-        """Actually handle a chunk of code within a tag's attributes.
+    def _really_parse_tag(self):
+        """Actually parse an HTML tag, starting with the open (``<foo>``)."""
+        data = _TagOpenData()
+        self._push(contexts.TAG_OPEN)
+        self._write(tokens.TagOpenOpen(showtag=True))
+        while True:
+            this, next = self._read(), self._read(1)
+            if this not in self.MARKERS:
+                for chunk in self.tag_splitter.split(this):
+                    if self._handle_tag_chunk(data, chunk):
+                        break
+            elif this is self.END:
+                if self._context & contexts.TAG_ATTR:
+                    self._pop()
+                self._fail_route()
+            elif this == ">" and data.literal:
+                if data.context & data.CX_ATTR:
+                    self._push_tag_buffer(data)
+                padding = data.padding_buffer[0] if data.padding_buffer else ""
+                self._write(tokens.TagCloseOpen(padding=padding))
+                self._context = contexts.TAG_BODY
+                self._head += 1
+                return self._parse(push=False)
+            elif this == "/" and next == ">" and data.literal:
+                if data.context & data.CX_ATTR:
+                    self._push_tag_buffer(data)
+                padding = data.padding_buffer[0] if data.padding_buffer else ""
+                self._write(tokens.TagCloseSelfclose(padding=padding))
+                self._head += 1
+                return self._pop()
+            else:
+                for chunk in self.tag_splitter.split(this):
+                    if self._handle_tag_chunk(data, chunk):
+                        break
+            self._head += 1
 
-        Called by :py:meth:`_handle_tag_chunk` and
-        :py:meth:`_handle_tag_attribute_body`.
-        """
-        if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
-            padding = 0
-            while chunks:
-                if chunks[0] == "":
-                    padding += 1
-                    chunks.pop(0)
-                else:
-                    break
-            self._write(tokens.TagAttrStart(padding=" " * padding))
-        elif self._context & contexts.TAG_OPEN_ATTR_IGNORE:
-            self._context ^= contexts.TAG_OPEN_ATTR_IGNORE
-            chunks.pop(0)
+    def _handle_tag_chunk(self, data, chunk):
+        if not chunk:
             return
-        elif is_new and self._context & contexts.TAG_OPEN_ATTR_QUOTED:
-            self._write_text(" ")  # Quoted chunks don't lose their spaces
-
-        if chunks:
-            chunk = chunks.pop(0)
-            if self._context & contexts.TAG_OPEN_ATTR_BODY:
-                self._context ^= contexts.TAG_OPEN_ATTR_BODY
-                self._context |= contexts.TAG_OPEN_ATTR_NAME
-            if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
-                if re.search(r'[^\\]"', chunk[:-1]):
-                    self._fail_route()
-                if re.search(r'[^\\]"$', chunk):
-                    self._write_text(chunk[:-1])
-                    self._context ^= contexts.TAG_OPEN_ATTR_QUOTED
-                    self._context |= contexts.TAG_OPEN_ATTR_NAME
-                    return True  # Back to _handle_tag_attribute_body()
+        if data.context & data.CX_NAME:
+            if chunk != chunk.lstrip():  # Tags cannot start with whitespace
+                self._fail_route()
             self._write_text(chunk)
-
-    def _handle_tag_chunk(self, text):
-        """Handle a chunk of code within a tag's attributes.
-
-        This is called by :py:meth:`_parse`, which intercepts parsing of
-        wikicode when we're inside of an opening tag and no :py:attr:`MARKERS`
-        are present.
-        """
-        if " " not in text and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
-            self._write_text(text)
-            return
-        chunks = text.split(" ")
-        is_new = False
-        is_quoted = False
-        if self._context & contexts.TAG_OPEN_NAME:
-            self._write_text(chunks.pop(0))
-            self._write_first(tokens.TagOpenOpen(showtag=True))
-            self._context ^= contexts.TAG_OPEN_NAME
-            self._context |= contexts.TAG_OPEN_ATTR_NAME
-            self._actually_handle_chunk(chunks, True)
-            is_new = True
-        while chunks:
-            result = self._actually_handle_chunk(chunks, is_new)
-            is_quoted = result or is_quoted
-            is_new = True
-        if is_quoted:
-            return self._pop()
-
-    def _handle_tag_attribute_body(self):
-        """Handle the body, or value, of a tag attribute.
-
-        Attribute bodies can usually be handled at once, but sometimes a new
-        stack must be created to keep track of "rich" attribute values that
-        contain, for example, templates.
-        """
-        self._context ^= contexts.TAG_OPEN_ATTR_NAME
-        self._context |= contexts.TAG_OPEN_ATTR_BODY
-        self._write(tokens.TagAttrEquals())
-        next = self._read(1)
-        if next not in self.MARKERS and next.startswith('"'):
-            chunks = None
-            if " " in next:
-                chunks = next.split(" ")
-                next = chunks.pop(0)
-            if re.search(r'[^\\]"$', next[1:]):
-                if not re.search(r'[^\\]"', next[1:-1]):
-                    self._write(tokens.TagAttrQuote())
-                    self._write_text(next[1:-1])
-                    self._head += 1
+            data.context = data.CX_NEED_SPACE
+        elif data.context & data.CX_NEED_SPACE:
+            if chunk.isspace():
+                if data.context & data.CX_ATTR_VALUE:
+                    self._push_tag_buffer(data)
+                data.padding_buffer.append(chunk)
+                data.context = data.CX_ATTR_READY
             else:
-                if not re.search(r'[^\\]"', next[1:]):
-                    self._head += 1
-                    reset = self._head
-                    try:
-                        attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED |
-                                           contexts.TAG_OPEN_ATTR_IGNORE)
-                    except BadRoute:
-                        self._head = reset
-                        self._write_text(next)
-                    else:
-                        self._write(tokens.TagAttrQuote())
-                        self._write_text(next[1:])
-                        self._write_all(attr)
-                        return
-        self._context ^= contexts.TAG_OPEN_ATTR_BODY
-        self._context |= contexts.TAG_OPEN_ATTR_NAME
-        while chunks:
-            self._actually_handle_chunk(chunks, True)
+                if data.context & data.CX_ATTR_VALUE:
+                    data.context ^= data.CX_NEED_SPACE
+                    data.quote_buffer = []
+                    data.ignore_quote = True
+                    self._head = data.reset
+                    return True  # Break out of chunk processing early
+                else:
+                    self._fail_route()
+        elif data.context & data.CX_ATTR_READY:
+            if chunk.isspace():
+                data.padding_buffer.append(chunk)
+            else:
+                data.context = data.CX_ATTR_NAME
+                self._push(contexts.TAG_ATTR)
+                self._write_text(chunk)  ### hook on here for {, <, etc
+        elif data.context & data.CX_ATTR_NAME:
+            if chunk.isspace():
+                data.padding_buffer.append(chunk)
+                data.context |= data.CX_NEED_EQUALS
+            elif chunk == "=":
+                if not data.context & data.CX_NEED_EQUALS:
+                    data.padding_buffer.append("")  # No padding before equals
+                data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE
+                self._write(tokens.TagAttrEquals())
+            else:
+                if data.context & data.CX_NEED_EQUALS:
+                    self._push_tag_buffer(data)
+                    data.padding_buffer.append("")  # No padding before tag
+                    data.context = data.CX_ATTR_NAME
+                    self._push(contexts.TAG_ATTR)
+                self._write_text(chunk)  ### hook on here for {, <, etc
+        elif data.context & data.CX_ATTR_VALUE:
+            ### handle backslashes here
+            if data.context & data.CX_NEED_QUOTE:
+                if chunk == '"' and not data.ignore_quote:
+                    data.context ^= data.CX_NEED_QUOTE
+                    data.literal = False
+                    data.reset = self._head
+                elif chunk.isspace():
+                    data.padding_buffer.append(chunk)
+                else:
+                    data.context ^= data.CX_NEED_QUOTE
+                    self._write_text(chunk)  ### hook on here for {, <, etc
+            elif not data.literal:
+                if chunk == '"':
+                    data.context |= data.CX_NEED_SPACE
+                    data.literal = True
+                else:
+                    data.quote_buffer.append(chunk)
+            elif chunk.isspace():
+                self._push_tag_buffer(data)
+                data.padding_buffer.append(chunk)
+                data.context = data.CX_ATTR_READY
+            else:
+                self._write_text(chunk)  ### hook on here for {, <, etc
+
+    def _push_tag_buffer(self, data):
+        buf = data.padding_buffer
+        while len(buf) < 3:
+            buf.append("")
+        self._write_first(tokens.TagAttrStart(
+            pad_after_eq=buf.pop(), pad_before_eq=buf.pop(),
+            pad_first=buf.pop()))
+        if data.quote_buffer:
+            self._write(tokens.TagAttrQuote())
+            self._write_text("".join(data.quote_buffer))
+        self._write_all(self._pop())
+        data.padding_buffer, data.quote_buffer = [], []
+        data.ignore_quote = False
 
     def _get_tag_from_stack(self, stack=None):
         """Return the tag based on the text in *stack*."""
         if not stack:
             sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen)
-            func = lambda tok: not isinstance(tok, sentinels)
-            stack = takewhile(func, self._stack)
+            pred = lambda tok: not isinstance(tok, sentinels)
+            stack = takewhile(pred, self._stack)
         text = [tok.text for tok in stack if isinstance(tok, tokens.Text)]
-        return "".join(text).rstrip().lower()
-
-    def _handle_tag_close_open(self):
-        """Handle the ending of an open tag (``>``)."""
-        padding = self._actually_close_tag_opening()
-        if not self._get_tag_from_stack():  # Tags cannot be blank
-            self._fail_route()
-        self._write(tokens.TagCloseOpen(padding=padding))
-
-    def _handle_tag_selfclose(self):
-        """Handle the ending of a tag that closes itself (``/>``)."""
-        padding = self._actually_close_tag_opening()
-        if not self._get_tag_from_stack():  # Tags cannot be blank
+        try:
+            return "".join(text).rstrip().lower().split()[0]
+        except IndexError:
             self._fail_route()
-        self._write(tokens.TagCloseSelfclose(padding=padding))
-        self._head += 1
-        return self._pop()
 
     def _handle_tag_open_close(self):
         """Handle the opening of a closing tag (``</``)."""
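_push_tag_buffer pops the padding buffer back-to-front, which is why TagAttrStart's keywords appear in reverse order; the buffer is first padded to three entries so any missing pads become empty strings. For `<ref name="foo">`, the stream this rewrite is aiming for looks roughly like this (a hand-worked sketch, not output captured from the diff):

    TagOpenOpen(showtag=True), Text(text="ref"),
    TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""),
    Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"),
    TagCloseOpen(padding="")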
@@ -579,10 +580,7 @@ class Tokenizer(object):
     def _handle_tag_close_close(self):
         """Handle the ending of a closing tag (``>``)."""
         closing = self._pop()
-        close_tag = self._get_tag_from_stack(closing)
-        open_tag = self._get_tag_from_stack()
-        if not close_tag or close_tag != open_tag:
-            # Closing and opening tags are empty or unequal, so fail this tag:
+        if self._get_tag_from_stack(closing) != self._get_tag_from_stack():
             self._fail_route()
         self._write_all(closing)
         self._write(tokens.TagCloseClose())
@@ -645,37 +643,30 @@ class Tokenizer(object):
         self._context |= contexts.FAIL_ON_RBRACE
         return True
 
-    def _parse(self, context=0):
+    def _parse(self, context=0, push=True):
         """Parse the wikicode string, using *context* for when to stop."""
-        self._push(context)
+        unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
+                  contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME |
+                  contexts.TAG_CLOSE)
+        fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
+                contexts.HEADING | contexts.COMMENT | contexts.TAG)
+        double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
+
+        if push:
+            self._push(context)
         while True:
             this = self._read()
-            unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
-                      contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME |
-                      contexts.TAG_CLOSE)
             if self._context & unsafe:
                 if not self._verify_safe(this):
-                    double = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
-                    if self._context & double:
+                    if self._context & double_fail:
                         self._pop()
                     self._fail_route()
             if this not in self.MARKERS:
-                if self._context & contexts.TAG_OPEN:
-                    should_exit = self._handle_tag_chunk(this)
-                    if should_exit:
-                        return should_exit
-                else:
-                    self._write_text(this)
+                self._write_text(this)
                 self._head += 1
                 continue
            if this is self.END:
-                fail = (
-                    contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
-                    contexts.HEADING | contexts.COMMENT | contexts.TAG)
                 if self._context & fail:
-                    double_fail = (
-                        contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE |
-                        contexts.TAG_OPEN_ATTR_QUOTED)
                     if self._context & double_fail:
                         self._pop()
                     self._fail_route()
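Hoisting unsafe, fail, and double_fail out of the loop builds each bitmask once per _parse call instead of once per token, and the new push flag is what lets _really_parse_tag hand off the stack it has already pushed. The two calling styles, side by side (sketch):

    name = self._parse(contexts.TEMPLATE_NAME)  # fresh route: pushes a new stack
    body = self._parse(push=False)              # tag body: continues on the stack
                                                # _really_parse_tag already pushed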
@@ -720,8 +711,6 @@ class Tokenizer(object):
             elif this == "=" and not self._global & contexts.GL_HEADING:
                 if self._read(-1) in ("\n", self.START):
                     self._parse_heading()
-                elif self._context & contexts.TAG_OPEN_ATTR_NAME:
-                    self._handle_tag_attribute_body()
                 else:
                     self._write_text("=")
             elif this == "=" and self._context & contexts.HEADING:
@@ -735,22 +724,8 @@ class Tokenizer(object):
                     self._parse_comment()
                 else:
                     self._write_text(this)
-            elif this == "<" and next != "/" and (
-                    not self._context & (contexts.TAG ^ contexts.TAG_BODY)):
+            elif this == "<" and next != "/" and not self._context & contexts.TAG_CLOSE:
                 self._parse_tag()
-            elif self._context & contexts.TAG_OPEN:
-                if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
-                    self._handle_tag_chunk(this)
-                elif this == "\n":
-                    self._fail_route()
-                elif this == ">":
-                    self._handle_tag_close_open()
-                elif this == "/" and next == ">":
-                    return self._handle_tag_selfclose()
-                elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME:
-                    self._handle_tag_attribute_body()
-                else:
-                    self._handle_tag_chunk(this)
             elif this == "<" and next == "/" and self._context & contexts.TAG_BODY:
                 self._handle_tag_open_close()
             elif this == ">" and self._context & contexts.TAG_CLOSE:
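With the TAG_OPEN_* branches gone, _parse's dispatch only needs the TAG_BODY and TAG_CLOSE checks kept above; everything between `<` and `>` is now handled inside _really_parse_tag. The whole path can be exercised through the tokenizer's normal entry point (sketch; exact token sequence assumed, not captured from this commit):

    >>> from mwparserfromhell.parser.tokenizer import Tokenizer
    >>> [type(tok).__name__
    ...  for tok in Tokenizer().tokenize('<ref name="foo">bar</ref>')]
    ['TagOpenOpen', 'Text', 'TagAttrStart', 'Text', 'TagAttrEquals',
     'TagAttrQuote', 'Text', 'TagCloseOpen', 'Text', 'TagOpenClose',
     'Text', 'TagCloseClose']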