Kaynağa Gözat

Really basic, messy, and fragile tag attribute support.

tags/v0.3
Ben Kurtovic 12 yıl önce
ebeveyn
işleme
d9f23b8faa
2 değiştirilmiş dosya ile 75 ekleme ve 63 silme
  1. +38
    -35
      mwparserfromhell/parser/contexts.py
  2. +37
    -28
      mwparserfromhell/parser/tokenizer.py

+ 38
- 35
mwparserfromhell/parser/contexts.py Dosyayı Görüntüle

@@ -65,11 +65,13 @@ Local (stack-specific) contexts:
* :py:const:`TAG`

* :py:const:`TAG_OPEN`
* :py:const:`TAG_ATTR`

* :py:const:`TAG_ATTR_NAME`
* :py:const:`TAG_ATTR_BODY`
* :py:const:`TAG_ATTR_BODY_QUOTED`
* :py:const:`TAG_OPEN_NAME`
* :py:const:`TAG_OPEN_ATTR`

* :py:const:`TAG_OPEN_ATTR_NAME`
* :py:const:`TAG_OPEN_ATTR_BODY`
* :py:const:`TAG_OPEN_ATTR_BODY_QUOTED`

* :py:const:`TAG_BODY`
* :py:const:`TAG_CLOSE`
@@ -81,37 +83,38 @@ Global contexts:

# Local contexts:

TEMPLATE = 0b00000000000000000111
TEMPLATE_NAME = 0b00000000000000000001
TEMPLATE_PARAM_KEY = 0b00000000000000000010
TEMPLATE_PARAM_VALUE = 0b00000000000000000100

ARGUMENT = 0b00000000000000011000
ARGUMENT_NAME = 0b00000000000000001000
ARGUMENT_DEFAULT = 0b00000000000000010000

WIKILINK = 0b00000000000001100000
WIKILINK_TITLE = 0b00000000000000100000
WIKILINK_TEXT = 0b00000000000001000000

HEADING = 0b00000001111110000000
HEADING_LEVEL_1 = 0b00000000000010000000
HEADING_LEVEL_2 = 0b00000000000100000000
HEADING_LEVEL_3 = 0b00000000001000000000
HEADING_LEVEL_4 = 0b00000000010000000000
HEADING_LEVEL_5 = 0b00000000100000000000
HEADING_LEVEL_6 = 0b00000001000000000000

COMMENT = 0b00000010000000000000

TAG = 0b11111100000000000000
TAG_OPEN = 0b00000100000000000000
TAG_ATTR = 0b00111000000000000000
TAG_ATTR_NAME = 0b00001000000000000000
TAG_ATTR_BODY = 0b00010000000000000000
TAG_ATTR_BODY_QUOTED = 0b00100000000000000000
TAG_BODY = 0b01000000000000000000
TAG_CLOSE = 0b10000000000000000000
TEMPLATE = 0b00000000000000000111
TEMPLATE_NAME = 0b00000000000000000001
TEMPLATE_PARAM_KEY = 0b00000000000000000010
TEMPLATE_PARAM_VALUE = 0b00000000000000000100

ARGUMENT = 0b00000000000000011000
ARGUMENT_NAME = 0b00000000000000001000
ARGUMENT_DEFAULT = 0b00000000000000010000

WIKILINK = 0b00000000000001100000
WIKILINK_TITLE = 0b00000000000000100000
WIKILINK_TEXT = 0b00000000000001000000

HEADING = 0b00000001111110000000
HEADING_LEVEL_1 = 0b00000000000010000000
HEADING_LEVEL_2 = 0b00000000000100000000
HEADING_LEVEL_3 = 0b00000000001000000000
HEADING_LEVEL_4 = 0b00000000010000000000
HEADING_LEVEL_5 = 0b00000000100000000000
HEADING_LEVEL_6 = 0b00000001000000000000

COMMENT = 0b00000010000000000000

TAG = 0b11111100000000000000
TAG_OPEN = 0b00111100000000000000
TAG_OPEN_NAME = 0b00000100000000000000
TAG_OPEN_ATTR = 0b00111000000000000000
TAG_OPEN_ATTR_NAME = 0b00001000000000000000
TAG_OPEN_ATTR_BODY = 0b00010000000000000000
TAG_OPEN_ATTR_BODY_QUOTED = 0b00100000000000000000
TAG_BODY = 0b01000000000000000000
TAG_CLOSE = 0b10000000000000000000


# Global contexts:


+ 37
- 28
mwparserfromhell/parser/tokenizer.py Dosyayı Görüntüle

@@ -426,7 +426,7 @@ class Tokenizer(object):
reset = self._head
self._head += 1
try:
tokens = self._parse(contexts.TAG_OPEN)
tokens = self._parse(contexts.TAG_OPEN_NAME)
except BadRoute:
self._head = reset
self._write_text("<")
@@ -438,34 +438,48 @@ class Tokenizer(object):
if not self._stack:
return None # Tag has an empty name?
text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
text = "".join([token.text for token in text]).strip().lower()
text = "".join([token.text for token in text]).rstrip().lower()
try:
return Tag.TRANSLATIONS[text]
except KeyError:
return Tag.TAG_UNKNOWN

def _actually_close_tag_opening(self):
if self._context & contexts.TAG_ATTR:
if self._context & contexts.TAG_ATTR_BODY:
self._context ^= contexts.TAG_ATTR_BODY
if self._context & contexts.TAG_ATTR_BODY_QUOTED:
self._context ^= contexts.TAG_ATTR_BODY_QUOTED
else:
self._context ^= contexts.TAG_ATTR_NAME
if self._context & contexts.TAG_OPEN_ATTR:
if self._context & contexts.TAG_OPEN_ATTR_NAME:
self._context ^= contexts.TAG_OPEN_ATTR_NAME
if self._context & contexts.TAG_OPEN_ATTR_BODY:
self._context ^= contexts.TAG_OPEN_ATTR_BODY
if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED:
self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED
else:
tag = self._get_tag_type_from_stack()
if tag is None:
if not tag:
self._fail_route()
self._write_first(tokens.TagOpenOpen(type=tag, showtag=True))

self._context ^= contexts.TAG_OPEN
self._context ^= contexts.TAG_OPEN_NAME
self._context |= contexts.TAG_BODY
padding = "" # TODO
return padding

# def _handle_attribute(self):
# if not self._context & contexts.TAG_ATTR:
# self._handle_tag_close_name()
# Handle a run of plain text read while the tokenizer is inside a tag's
# opening portion, treating each space as a separator between the tag name
# and its attributes.
# NOTE(review): this view shows the method with its indentation stripped; the
# nesting described in the comments below is inferred -- confirm against the
# real file.
def _handle_tag_chunk(self, text):
# No space in the text: nothing to split, so write it through unchanged.
if " " not in text:
self._write_text(text)
return
# Split on the literal space character only; consecutive spaces therefore
# produce empty-string chunks.
chunks = text.split(" ")
if self._context & contexts.TAG_OPEN_NAME:
# Still in the tag-name context: the first chunk is written as the rest of
# the name, then the tag type is looked up from the stack.
self._write_text(chunks.pop(0))
tag = self._get_tag_type_from_stack()
# A falsy lookup result (None, or any other falsy type value) triggers
# _fail_route().
if not tag:
self._fail_route()
# presumably _write_first places the opening token ahead of the name text
# already written -- verify against its definition.
self._write_first(tokens.TagOpenOpen(type=tag, showtag=True))
# The TAG_OPEN_NAME bit is known to be set here, so the XOR clears it before
# the attribute-name bit is switched on.
self._context ^= contexts.TAG_OPEN_NAME
self._context |= contexts.TAG_OPEN_ATTR_NAME
self._write(tokens.TagAttrStart())
# Write the remaining chunks in order; every chunk after the first is
# preceded by a TagAttrStart token.
for i, chunk in enumerate(chunks):
if i > 0:
self._write(tokens.TagAttrStart())
self._write_text(chunk)

# def _handle_attribute_name(self):
# ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED
@@ -505,7 +519,10 @@ class Tokenizer(object):
while True:
this = self._read()
if this not in self.MARKERS:
self._write_text(this)
if self._context & contexts.TAG_OPEN:
self._handle_tag_chunk(this)
else:
self._write_text(this)
self._head += 1
continue
if this is self.END:
@@ -567,25 +584,17 @@ class Tokenizer(object):
elif this == "<" and next != "/" and (
not self._context & (contexts.TAG ^ contexts.TAG_BODY)):
self._parse_tag()
# elif this == " " and (self._context & contexts.TAG_OPEN and not
# self._context & contexts.TAG_ATTR_BODY_QUOTED):
# self._handle_attribute()
# elif this == "=" and self._context & contexts.TAG_ATTR_NAME:
# self._handle_attribute_name()
# elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED:
# self._handle_quoted_attribute_close()
elif self._context & contexts.TAG_OPEN and (
not self._context & contexts.TAG_ATTR_BODY_QUOTED):
elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_BODY_QUOTED):
if this == "\n":
if self._context & contexts.TAG_CLOSE:
self._pop()
self._fail_route()
elif this == ">":
self._handle_tag_close_open()
elif this == "/":
elif this == "/" and next == ">":
return self._handle_tag_selfclose()
else:
self._write_text(this)
# elif this == "=":
# self._handle_tag_attr_body()
elif this == "<" and next == "/" and (
self._context & contexts.TAG_BODY):
self._handle_tag_open_close()


Yükleniyor…
İptal
Kaydet