
Support comments in tokenizer.

tags/v0.1.1
Ben Kurtovic, 11 years ago
commit 65862befdc
4 changed files with 69 additions and 39 deletions:

  1. mwparserfromhell/nodes/comment.py (+1, -1)
  2. mwparserfromhell/parser/builder.py (+1, -1)
  3. mwparserfromhell/parser/contexts.py (+35, -31)
  4. mwparserfromhell/parser/tokenizer.py (+32, -6)

mwparserfromhell/nodes/comment.py (+1, -1)

@@ -41,6 +41,6 @@ class Comment(Node):
         """The hidden text contained between ``<!--`` and ``-->``."""
         return self._contents

-    @value.setter
+    @contents.setter
     def contents(self, value):
         self._contents = str(value)
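
The fix above matters because a setter must be registered on the property object itself; `@value.setter` references a name that does not exist at class scope and raises a NameError when the class body executes. A minimal standalone sketch of the corrected pattern (a stand-in, not the real Comment class, which subclasses Node):

    # Sketch of the property/setter pattern restored by this fix.
    class Comment:
        def __init__(self, contents):
            self._contents = str(contents)

        @property
        def contents(self):
            """The hidden text contained between ``<!--`` and ``-->``."""
            return self._contents

        @contents.setter  # must name the property object, not "value"
        def contents(self, value):
            self._contents = str(value)

    comment = Comment("foo")
    comment.contents = "bar"
    assert comment.contents == "bar"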

mwparserfromhell/parser/builder.py (+1, -1)

@@ -126,7 +126,7 @@ class Builder(object):
self._write(self._handle_token(token))

def _handle_entity(self):
"""Handle a case where a HTML entity is at the head of the tokens."""
"""Handle a case where an HTML entity is at the head of the tokens."""
token = self._tokens.pop()
if isinstance(token, tokens.HTMLEntityNumeric):
token = self._tokens.pop()


mwparserfromhell/parser/contexts.py (+35, -31)

@@ -35,49 +35,53 @@ will cover ``BAR == 0b10`` and ``BAZ == 0b01``).

 Local (stack-specific) contexts:

-* :py:const:`TEMPLATE` (``0b00000000111``)
+* :py:const:`TEMPLATE`

-    * :py:const:`TEMPLATE_NAME` (``0b00000000001``)
-    * :py:const:`TEMPLATE_PARAM_KEY` (``0b00000000010``)
-    * :py:const:`TEMPLATE_PARAM_VALUE` (``0b00000000100``)
+    * :py:const:`TEMPLATE_NAME`
+    * :py:const:`TEMPLATE_PARAM_KEY`
+    * :py:const:`TEMPLATE_PARAM_VALUE`

-* :py:const:`ARGUMENT` (``0b00000011000``)
+* :py:const:`ARGUMENT`

-    * :py:const:`ARGUMENT_NAME` (``0b00000001000``)
-    * :py:const:`ARGUMENT_DEFAULT` (``0b00000010000``)
+    * :py:const:`ARGUMENT_NAME`
+    * :py:const:`ARGUMENT_DEFAULT`

-* :py:const:`HEADING` (``0b111111000``)
+* :py:const:`HEADING`

-    * :py:const:`HEADING_LEVEL_1` (``0b00000100000``)
-    * :py:const:`HEADING_LEVEL_2` (``0b00001000000``)
-    * :py:const:`HEADING_LEVEL_3` (``0b00010000000``)
-    * :py:const:`HEADING_LEVEL_4` (``0b00100000000``)
-    * :py:const:`HEADING_LEVEL_5` (``0b01000000000``)
-    * :py:const:`HEADING_LEVEL_6` (``0b10000000000``)
+    * :py:const:`HEADING_LEVEL_1`
+    * :py:const:`HEADING_LEVEL_2`
+    * :py:const:`HEADING_LEVEL_3`
+    * :py:const:`HEADING_LEVEL_4`
+    * :py:const:`HEADING_LEVEL_5`
+    * :py:const:`HEADING_LEVEL_6`
+
+* :py:const:`COMMENT`

 Global contexts:

-* :py:const:`GL_HEADING` (``0b1``)
+* :py:const:`GL_HEADING`
 """

 # Local contexts:

-TEMPLATE = 0b00000000111
-TEMPLATE_NAME = 0b00000000001
-TEMPLATE_PARAM_KEY = 0b00000000010
-TEMPLATE_PARAM_VALUE = 0b00000000100
-
-ARGUMENT = 0b00000011000
-ARGUMENT_NAME = 0b00000001000
-ARGUMENT_DEFAULT = 0b00000010000
-
-HEADING = 0b11111100000
-HEADING_LEVEL_1 = 0b00000100000
-HEADING_LEVEL_2 = 0b00001000000
-HEADING_LEVEL_3 = 0b00010000000
-HEADING_LEVEL_4 = 0b00100000000
-HEADING_LEVEL_5 = 0b01000000000
-HEADING_LEVEL_6 = 0b10000000000
+TEMPLATE = 0b000000000111
+TEMPLATE_NAME = 0b000000000001
+TEMPLATE_PARAM_KEY = 0b000000000010
+TEMPLATE_PARAM_VALUE = 0b000000000100
+
+ARGUMENT = 0b000000011000
+ARGUMENT_NAME = 0b000000001000
+ARGUMENT_DEFAULT = 0b000000010000
+
+HEADING = 0b011111100000
+HEADING_LEVEL_1 = 0b000000100000
+HEADING_LEVEL_2 = 0b000001000000
+HEADING_LEVEL_3 = 0b000010000000
+HEADING_LEVEL_4 = 0b000100000000
+HEADING_LEVEL_5 = 0b001000000000
+HEADING_LEVEL_6 = 0b010000000000
+
+COMMENT = 0b100000000000


 # Global contexts:
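
For orientation: each constant above is a single-bit flag, so widening the literals from 11 to 12 bits makes room for COMMENT while keeping the aggregate masks (TEMPLATE, ARGUMENT, HEADING) as plain ORs of their members. A quick sketch of how these flags combine and are tested, mirroring the values defined above:

    # Mirrors the constants above; aggregates are ORs of their members.
    TEMPLATE_NAME = 0b000000000001
    TEMPLATE_PARAM_KEY = 0b000000000010
    TEMPLATE_PARAM_VALUE = 0b000000000100
    TEMPLATE = TEMPLATE_NAME | TEMPLATE_PARAM_KEY | TEMPLATE_PARAM_VALUE
    COMMENT = 0b100000000000

    context = TEMPLATE_PARAM_KEY       # a stack's current context
    assert context & TEMPLATE          # inside some part of a template...
    assert not context & COMMENT       # ...but not inside a comment
    assert TEMPLATE == 0b000000000111  # the aggregate equals the OR'd bits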


mwparserfromhell/parser/tokenizer.py (+32, -6)

@@ -41,8 +41,8 @@ class Tokenizer(object):
     START = object()
     END = object()
     MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
-               "/", "-", "\n", END]
-    regex = re.compile(r"([{}\[\]<>|=&#*;:/\-\n])", flags=re.IGNORECASE)
+               "/", "-", "!", "\n", END]
+    regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)

     def __init__(self):
         self._text = None
@@ -327,7 +327,7 @@ class Tokenizer(object):
         return self._pop(), after_level

     def _really_parse_entity(self):
-        """Actually parse a HTML entity and ensure that it is valid."""
+        """Actually parse an HTML entity and ensure that it is valid."""
         self._write(tokens.HTMLEntityStart())
         self._head += 1

@@ -369,7 +369,7 @@ class Tokenizer(object):
         self._write(tokens.HTMLEntityEnd())

     def _parse_entity(self):
-        """Parse a HTML entity at the head of the wikicode string."""
+        """Parse an HTML entity at the head of the wikicode string."""
         reset = self._head
         self._push()
         try:
@@ -380,6 +380,21 @@
         else:
             self._write_all(self._pop())

+    def _parse_comment(self):
+        """Parse an HTML comment at the head of the wikicode string."""
+        self._head += 4
+        reset = self._head - 1
+        try:
+            comment = self._parse(contexts.COMMENT)
+        except BadRoute:
+            self._head = reset
+            self._write_text("<!--")
+        else:
+            self._write(tokens.CommentStart())
+            self._write_all(comment)
+            self._write(tokens.CommentEnd())
+            self._head += 2
+
     def _parse(self, context=0):
         """Parse the wikicode string, using *context* for when to stop."""
         self._push(context)
@@ -390,12 +405,18 @@
             self._head += 1
             continue
         if this is self.END:
-            fail = contexts.TEMPLATE | contexts.ARGUMENT | contexts.HEADING
+            fail = (contexts.TEMPLATE | contexts.ARGUMENT |
+                    contexts.HEADING | contexts.COMMENT)
             if self._context & fail:
                 self._fail_route()
             return self._pop()
         next = self._read(1)
-        if this == next == "{":
+        if self._context & contexts.COMMENT:
+            if this == next == "-" and self._read(2) == ">":
+                return self._pop()
+            else:
+                self._write_text(this)
+        elif this == next == "{":
             self._parse_template_or_argument()
         elif this == "|" and self._context & contexts.TEMPLATE:
             self._handle_template_param()
@@ -421,6 +442,11 @@
             self._fail_route()
         elif this == "&":
             self._parse_entity()
+        elif this == "<" and next == "!":
+            if self._read(2) == self._read(3) == "-":
+                self._parse_comment()
+            else:
+                self._write_text(this)
         else:
             self._write_text(this)
         self._head += 1
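
Taken together, the new branches mean `<!` followed by `--` hands control to `_parse_comment`, which advances past the four-character `<!--`, parses in `contexts.COMMENT` until `-->`, and falls back to literal text on a BadRoute (e.g., an unterminated comment). A hedged sketch of the resulting behavior, assuming this revision is importable; exact Text token boundaries may differ:

    from mwparserfromhell.parser.tokenizer import Tokenizer

    # A closed comment becomes CommentStart / Text / CommentEnd tokens:
    print(Tokenizer().tokenize("foo <!-- hidden --> bar"))
    # e.g. [Text(text="foo "), CommentStart(), Text(text=" hidden "),
    #       CommentEnd(), Text(text=" bar")]

    # An unterminated comment fails the COMMENT route and stays as text:
    print(Tokenizer().tokenize("foo <!-- hidden"))
    # e.g. [Text(text="foo <!-- hidden")]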

