浏览代码

Support Headings in tokenizer; handle tokens backwards in builder.

* Some other fixes, additions.
tags/v0.1
Ben Kurtovic 12 年前
父节点
当前提交
e57b6bdd93
共有 5 个文件被更改,包括 165 次插入 和 81 次删除
  1. +5
    -4
      mwparserfromhell/nodes/template.py
  2. +25
    -29
      mwparserfromhell/parser/builder.py
  3. +19
    -4
      mwparserfromhell/parser/contexts.py
  4. +113
    -42
      mwparserfromhell/parser/tokenizer.py
  5. +3
    -2
      mwparserfromhell/parser/tokens.py

+ 5
- 4
mwparserfromhell/nodes/template.py 查看文件

@@ -118,7 +118,7 @@ class Template(Node):

def get(self, name):
name = name.strip() if isinstance(name, basestring) else unicode(name)
for param in self.params:
for param in reversed(self.params):
if param.name.strip() == name:
return param
raise ValueError(name)
@@ -149,8 +149,9 @@ class Template(Node):
else:
int_keys = set()
for param in self.params:
if re.match(r"[1-9][0-9]*$", param.name.strip()):
int_keys.add(int(unicode(param.name)))
if not param.showkey:
if re.match(r"[1-9][0-9]*$", param.name.strip()):
int_keys.add(int(unicode(param.name)))
expected = min(set(range(1, len(int_keys) + 2)) - int_keys)
if expected == int_name:
showkey = False
@@ -170,7 +171,7 @@ class Template(Node):
self.params.append(param)
return param

def remove(self, name, keep_field=False, force_no_field=False):
def remove(self, name, keep_field=False, force_no_field=False): # KEEP FIRST FIELD, REMOVE ALL AFTER
name = name.strip() if isinstance(name, basestring) else unicode(name)
for i, param in enumerate(self.params):
if param.name.strip() == name:


+ 25
- 29
mwparserfromhell/parser/builder.py 查看文件

@@ -20,8 +20,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re

from . import tokens
from ..nodes import Heading, HTMLEntity, Tag, Template, Text
from ..nodes.extras import Attribute, Parameter
@@ -49,42 +47,39 @@ class Builder(object):
def _write(self, item):
self._stacks[-1].append(item)

def _handle_parameter(self, key):
def _handle_parameter(self, default):
key = None
showkey = False
self._push()
while self._tokens:
token = self._tokens.pop(0)
token = self._tokens.pop()
if isinstance(token, tokens.TemplateParamEquals):
key = self._pop()
showkey = True
self._push()
elif isinstance(token, (tokens.TemplateParamSeparator,
tokens.TemplateClose)):
self._tokens.insert(0, token)
self._tokens.append(token)
value = self._pop()
if not key:
key = self._wrap([Text(unicode(default))])
return Parameter(key, value, showkey)
else:
self._write(self._handle_token(token))

def _handle_template(self):
params = []
int_keys = set()
int_key_range = {1}
default = 1
self._push()
while self._tokens:
token = self._tokens.pop(0)
token = self._tokens.pop()
if isinstance(token, tokens.TemplateParamSeparator):
if not params:
name = self._pop()
default = unicode(min(int_key_range - int_keys))
param = self._handle_parameter(self._wrap([Text(default)]))
if re.match(r"[1-9][0-9]*$", param.name.strip()):
# We try a more restrictive test for integers than
# try: int(), because "01" as a key will pass through int()
# correctly but is not a valid integer key in wikicode:
int_keys.add(int(unicode(param.name)))
int_key_range.add(len(int_keys) + 1)
param = self._handle_parameter(default)
params.append(param)
if not param.showkey:
default += 1
elif isinstance(token, tokens.TemplateClose):
if not params:
name = self._pop()
@@ -93,25 +88,25 @@ class Builder(object):
self._write(self._handle_token(token))

def _handle_entity(self):
token = self._tokens.pop(0)
token = self._tokens.pop()
if isinstance(token, tokens.HTMLEntityNumeric):
token = self._tokens.pop(0)
token = self._tokens.pop()
if isinstance(token, tokens.HTMLEntityHex):
text = self._tokens.pop(0)
self._tokens.pop(0) # Remove HTMLEntityEnd
text = self._tokens.pop()
self._tokens.pop() # Remove HTMLEntityEnd
return HTMLEntity(text.text, named=False, hexadecimal=True,
hex_char=token.char)
self._tokens.pop(0) # Remove HTMLEntityEnd
self._tokens.pop() # Remove HTMLEntityEnd
return HTMLEntity(token.text, named=False, hexadecimal=False)
self._tokens.pop(0) # Remove HTMLEntityEnd
self._tokens.pop() # Remove HTMLEntityEnd
return HTMLEntity(token.text, named=True, hexadecimal=False)

def _handle_heading(self, token):
level = token.level
self._push()
while self._tokens:
token = self._tokens.pop(0)
if isinstance(token, tokens.HeadingBlock):
token = self._tokens.pop()
if isinstance(token, tokens.HeadingEnd):
title = self._pop()
return Heading(title, level)
else:
@@ -121,7 +116,7 @@ class Builder(object):
name, quoted = None, False
self._push()
while self._tokens:
token = self._tokens.pop(0)
token = self._tokens.pop()
if isinstance(token, tokens.TagAttrEquals):
name = self._pop()
self._push()
@@ -129,7 +124,7 @@ class Builder(object):
quoted = True
elif isinstance(token, (tokens.TagAttrStart,
tokens.TagCloseOpen)):
self._tokens.insert(0, token)
self._tokens.append(token)
if name is not None:
return Attribute(name, self._pop(), quoted)
return Attribute(self._pop(), quoted=quoted)
@@ -141,7 +136,7 @@ class Builder(object):
attrs = []
self._push()
while self._tokens:
token = self._tokens.pop(0)
token = self._tokens.pop()
if isinstance(token, tokens.TagAttrStart):
attrs.append(self._handle_attribute())
elif isinstance(token, tokens.TagCloseOpen):
@@ -167,15 +162,16 @@ class Builder(object):
return self._handle_template()
elif isinstance(token, tokens.HTMLEntityStart):
return self._handle_entity()
elif isinstance(token, tokens.HeadingBlock):
elif isinstance(token, tokens.HeadingStart):
return self._handle_heading(token)
elif isinstance(token, tokens.TagOpenOpen):
return self._handle_tag(token)

def build(self, tokenlist):
self._tokens = tokenlist
self._tokens.reverse()
self._push()
while self._tokens:
node = self._handle_token(self._tokens.pop(0))
node = self._handle_token(self._tokens.pop())
self._write(node)
return self._pop()

+ 19
- 4
mwparserfromhell/parser/contexts.py 查看文件

@@ -20,7 +20,22 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

TEMPLATE = 0b111
TEMPLATE_NAME = 0b001
TEMPLATE_PARAM_KEY = 0b010
TEMPLATE_PARAM_VALUE = 0b100
# Local (stack-specific) contexts:

TEMPLATE = 0b000000111
TEMPLATE_NAME = 0b000000001
TEMPLATE_PARAM_KEY = 0b000000010
TEMPLATE_PARAM_VALUE = 0b000000100

HEADING = 0b111111000
HEADING_LEVEL_1 = 0b000001000
HEADING_LEVEL_2 = 0b000010000
HEADING_LEVEL_3 = 0b000100000
HEADING_LEVEL_4 = 0b001000000
HEADING_LEVEL_5 = 0b010000000
HEADING_LEVEL_6 = 0b100000000


# Global contexts:

GL_HEADING = 0b1

+ 113
- 42
mwparserfromhell/parser/tokenizer.py 查看文件

@@ -21,6 +21,7 @@
# SOFTWARE.

import htmlentitydefs
from math import log
import re
import string

@@ -32,17 +33,19 @@ __all__ = ["Tokenizer"]
class BadRoute(Exception):
pass


class Tokenizer(object):
START = object()
END = object()
SENTINELS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";",
":", "/", "-", END]
regex = re.compile(r"([{}\[\]<>|=&#*;:/-])", flags=re.IGNORECASE)
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
"/", "-", "\n", END]
regex = re.compile(r"([{}\[\]<>|=&#*;:/\-\n])", flags=re.IGNORECASE)

def __init__(self):
self._text = None
self._head = 0
self._stacks = []
self._global = 0

@property
def _stack(self):
@@ -76,6 +79,10 @@ class Tokenizer(object):
self._push_textbuffer()
return self._stacks.pop()[0]

def _fail_route(self):
self._pop()
raise BadRoute()

def _write(self, token):
self._push_textbuffer()
self._stack.append(token)
@@ -84,16 +91,20 @@ class Tokenizer(object):
self._textbuffer.append(text)

def _write_all(self, tokenlist):
if tokenlist and isinstance(tokenlist[0], tokens.Text):
self._write_text(tokenlist.pop(0).text)
self._push_textbuffer()
self._stack.extend(tokenlist)

def _read(self, delta=0, wrap=False):
def _read(self, delta=0, wrap=False, strict=False):
index = self._head + delta
if index < 0 and (not wrap or abs(index) > len(self._text)):
return self.START
try:
return self._text[index]
except IndexError:
if strict:
self._fail_route()
return self.END

def _parse_template(self):
@@ -115,7 +126,7 @@ class Tokenizer(object):
text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
text = "".join([token.text for token in text])
if text.strip() and "\n" in text.strip():
raise BadRoute(self._pop())
self._fail_route()

def _handle_template_param(self):
if self._context & contexts.TEMPLATE_NAME:
@@ -137,44 +148,98 @@ class Tokenizer(object):
self._head += 1
return self._pop()

def _parse_entity(self):
def _parse_heading(self):
self._global |= contexts.GL_HEADING
reset = self._head
self._push()
self._head += 1
best = 1
while self._read() == "=":
best += 1
self._head += 1
context = contexts.HEADING_LEVEL_1 << min(best - 1, 5)

try:
self._write(tokens.HTMLEntityStart())
title, level = self._parse(context)
except BadRoute:
self._head = reset + best - 1
self._write_text("=" * best)
else:
self._write(tokens.HeadingStart(level=level))
if level < best:
self._write_text("=" * (best - level))
self._write_all(title)
self._write(tokens.HeadingEnd())
finally:
self._global ^= contexts.GL_HEADING

def _handle_heading_end(self):
reset = self._head
self._head += 1
best = 1
while self._read() == "=":
best += 1
self._head += 1
this = self._read()
if not this or this is self.END:
raise BadRoute(self._pop())
numeric = hexadecimal = False
if this == "#":
numeric = True
self._write(tokens.HTMLEntityNumeric())
self._head += 1
this = self._read()
if not this or this is self.END:
raise BadRoute(self._pop())
if this[0].lower() == "x":
hexadecimal = True
self._write(tokens.HTMLEntityHex(char=this[0]))
this = this[1:]
valid = string.hexdigits if hexadecimal else string.digits
if not numeric and not hexadecimal:
valid += string.ascii_letters
if not all([char in valid for char in this]):
raise BadRoute(self._pop())
current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
level = min(current, min(best, 6))

try:
after, after_level = self._parse(self._context)
except BadRoute:
if level < best:
self._write_text("=" * (best - level))
self._head = reset + best - 1
return self._pop(), level
else:
self._write_text("=" * best)
self._write_all(after)
return self._pop(), after_level

def _really_parse_entity(self):
self._write(tokens.HTMLEntityStart())
self._head += 1

this = self._read(strict=True)
if this == "#":
numeric = True
self._write(tokens.HTMLEntityNumeric())
self._head += 1
if self._read() != ";":
raise BadRoute(self._pop())
if numeric:
test = int(this, 16) if hexadecimal else int(this)
if test < 1 or test > 0x10FFFF:
raise BadRoute(self._pop())
this = self._read(strict=True)
if this[0].lower() == "x":
hexadecimal = True
self._write(tokens.HTMLEntityHex(char=this[0]))
this = this[1:]
if not this:
self._fail_route()
else:
if this not in htmlentitydefs.entitydefs:
raise BadRoute(self._pop())
self._write(tokens.Text(text=this))
self._write(tokens.HTMLEntityEnd())
hexadecimal = False
else:
numeric = hexadecimal = False

valid = string.hexdigits if hexadecimal else string.digits
if not numeric and not hexadecimal:
valid += string.ascii_letters
if not all([char in valid for char in this]):
self._fail_route()

self._head += 1
if self._read() != ";":
self._fail_route()
if numeric:
test = int(this, 16) if hexadecimal else int(this)
if test < 1 or test > 0x10FFFF:
self._fail_route()
else:
if this not in htmlentitydefs.entitydefs:
self._fail_route()

self._write(tokens.Text(text=this))
self._write(tokens.HTMLEntityEnd())

def _parse_entity(self):
reset = self._head
self._push()
try:
self._really_parse_entity()
except BadRoute:
self._head = reset
self._write_text(self._read())
@@ -185,15 +250,15 @@ class Tokenizer(object):
self._push(context)
while True:
this = self._read()
if this not in self.SENTINELS:
if this not in self.MARKERS:
self._write_text(this)
self._head += 1
continue
if this is self.END:
if self._context & contexts.TEMPLATE:
raise BadRoute(self._pop())
if self._context & (contexts.TEMPLATE | contexts.HEADING):
self._fail_route()
return self._pop()
next = self._read(1)
prev, next = self._read(-1), self._read(1)
if this == next == "{":
self._parse_template()
elif this == "|" and self._context & contexts.TEMPLATE:
@@ -202,6 +267,12 @@ class Tokenizer(object):
self._handle_template_param_value()
elif this == next == "}" and self._context & contexts.TEMPLATE:
return self._handle_template_end()
elif (prev == "\n" or prev == self.START) and this == "=" and not self._global & contexts.GL_HEADING:
self._parse_heading()
elif this == "=" and self._context & contexts.HEADING:
return self._handle_heading_end()
elif this == "\n" and self._context & contexts.HEADING:
self._fail_route()
elif this == "&":
self._parse_entity()
else:


+ 3
- 2
mwparserfromhell/parser/tokens.py 查看文件

@@ -29,7 +29,7 @@ class Token(object):
def __repr__(self):
args = []
for key, value in self._kwargs.iteritems():
if len(value) > 100:
if isinstance(value, basestring) and len(value) > 100:
args.append(key + "=" + repr(value[:97] + "..."))
else:
args.append(key + "=" + repr(value))
@@ -66,7 +66,8 @@ HTMLEntityNumeric = make("HTMLEntityNumeric") # #
HTMLEntityHex = make("HTMLEntityHex") # x
HTMLEntityEnd = make("HTMLEntityEnd") # ;

HeadingBlock = make("HeadingBlock") # =...
HeadingStart = make("HeadingStart") # =...
HeadingEnd = make("HeadingEnd") # =...

TagOpenOpen = make("TagOpenOpen") # <
TagAttrStart = make("TagAttrStart")


正在加载...
取消
保存