* Some other fixes, additions. (tags/v0.1)
@@ -118,7 +118,7 @@ class Template(Node): | |||
def get(self, name): | |||
name = name.strip() if isinstance(name, basestring) else unicode(name) | |||
for param in self.params: | |||
for param in reversed(self.params): | |||
if param.name.strip() == name: | |||
return param | |||
raise ValueError(name) | |||
@@ -149,8 +149,9 @@ class Template(Node): | |||
else: | |||
int_keys = set() | |||
for param in self.params: | |||
if re.match(r"[1-9][0-9]*$", param.name.strip()): | |||
int_keys.add(int(unicode(param.name))) | |||
if not param.showkey: | |||
if re.match(r"[1-9][0-9]*$", param.name.strip()): | |||
int_keys.add(int(unicode(param.name))) | |||
expected = min(set(range(1, len(int_keys) + 2)) - int_keys) | |||
if expected == int_name: | |||
showkey = False | |||
@@ -170,7 +171,7 @@ class Template(Node): | |||
self.params.append(param) | |||
return param | |||
def remove(self, name, keep_field=False, force_no_field=False): | |||
def remove(self, name, keep_field=False, force_no_field=False): # KEEP FIRST FIELD, REMOVE ALL AFTER | |||
name = name.strip() if isinstance(name, basestring) else unicode(name) | |||
for i, param in enumerate(self.params): | |||
if param.name.strip() == name: | |||
@@ -20,8 +20,6 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
import re | |||
from . import tokens | |||
from ..nodes import Heading, HTMLEntity, Tag, Template, Text | |||
from ..nodes.extras import Attribute, Parameter | |||
@@ -49,42 +47,39 @@ class Builder(object): | |||
# Append a built item to the stack currently on top of self._stacks.
def _write(self, item): | |||
self._stacks[-1].append(item) | |||
def _handle_parameter(self, key): | |||
def _handle_parameter(self, default): | |||
key = None | |||
showkey = False | |||
self._push() | |||
while self._tokens: | |||
token = self._tokens.pop(0) | |||
token = self._tokens.pop() | |||
if isinstance(token, tokens.TemplateParamEquals): | |||
key = self._pop() | |||
showkey = True | |||
self._push() | |||
elif isinstance(token, (tokens.TemplateParamSeparator, | |||
tokens.TemplateClose)): | |||
self._tokens.insert(0, token) | |||
self._tokens.append(token) | |||
value = self._pop() | |||
if not key: | |||
key = self._wrap([Text(unicode(default))]) | |||
return Parameter(key, value, showkey) | |||
else: | |||
self._write(self._handle_token(token)) | |||
def _handle_template(self): | |||
params = [] | |||
int_keys = set() | |||
int_key_range = {1} | |||
default = 1 | |||
self._push() | |||
while self._tokens: | |||
token = self._tokens.pop(0) | |||
token = self._tokens.pop() | |||
if isinstance(token, tokens.TemplateParamSeparator): | |||
if not params: | |||
name = self._pop() | |||
default = unicode(min(int_key_range - int_keys)) | |||
param = self._handle_parameter(self._wrap([Text(default)])) | |||
if re.match(r"[1-9][0-9]*$", param.name.strip()): | |||
# We try a more restrictive test for integers than | |||
# try: int(), because "01" as a key will pass through int() | |||
# correctly but is not a valid integer key in wikicode: | |||
int_keys.add(int(unicode(param.name))) | |||
int_key_range.add(len(int_keys) + 1) | |||
param = self._handle_parameter(default) | |||
params.append(param) | |||
if not param.showkey: | |||
default += 1 | |||
elif isinstance(token, tokens.TemplateClose): | |||
if not params: | |||
name = self._pop() | |||
@@ -93,25 +88,25 @@ class Builder(object): | |||
self._write(self._handle_token(token)) | |||
def _handle_entity(self): | |||
token = self._tokens.pop(0) | |||
token = self._tokens.pop() | |||
if isinstance(token, tokens.HTMLEntityNumeric): | |||
token = self._tokens.pop(0) | |||
token = self._tokens.pop() | |||
if isinstance(token, tokens.HTMLEntityHex): | |||
text = self._tokens.pop(0) | |||
self._tokens.pop(0) # Remove HTMLEntityEnd | |||
text = self._tokens.pop() | |||
self._tokens.pop() # Remove HTMLEntityEnd | |||
return HTMLEntity(text.text, named=False, hexadecimal=True, | |||
hex_char=token.char) | |||
self._tokens.pop(0) # Remove HTMLEntityEnd | |||
self._tokens.pop() # Remove HTMLEntityEnd | |||
return HTMLEntity(token.text, named=False, hexadecimal=False) | |||
self._tokens.pop(0) # Remove HTMLEntityEnd | |||
self._tokens.pop() # Remove HTMLEntityEnd | |||
return HTMLEntity(token.text, named=True, hexadecimal=False) | |||
def _handle_heading(self, token): | |||
level = token.level | |||
self._push() | |||
while self._tokens: | |||
token = self._tokens.pop(0) | |||
if isinstance(token, tokens.HeadingBlock): | |||
token = self._tokens.pop() | |||
if isinstance(token, tokens.HeadingEnd): | |||
title = self._pop() | |||
return Heading(title, level) | |||
else: | |||
@@ -121,7 +116,7 @@ class Builder(object): | |||
name, quoted = None, False | |||
self._push() | |||
while self._tokens: | |||
token = self._tokens.pop(0) | |||
token = self._tokens.pop() | |||
if isinstance(token, tokens.TagAttrEquals): | |||
name = self._pop() | |||
self._push() | |||
@@ -129,7 +124,7 @@ class Builder(object): | |||
quoted = True | |||
elif isinstance(token, (tokens.TagAttrStart, | |||
tokens.TagCloseOpen)): | |||
self._tokens.insert(0, token) | |||
self._tokens.append(token) | |||
if name is not None: | |||
return Attribute(name, self._pop(), quoted) | |||
return Attribute(self._pop(), quoted=quoted) | |||
@@ -141,7 +136,7 @@ class Builder(object): | |||
attrs = [] | |||
self._push() | |||
while self._tokens: | |||
token = self._tokens.pop(0) | |||
token = self._tokens.pop() | |||
if isinstance(token, tokens.TagAttrStart): | |||
attrs.append(self._handle_attribute()) | |||
elif isinstance(token, tokens.TagCloseOpen): | |||
@@ -167,15 +162,16 @@ class Builder(object): | |||
return self._handle_template() | |||
elif isinstance(token, tokens.HTMLEntityStart): | |||
return self._handle_entity() | |||
elif isinstance(token, tokens.HeadingBlock): | |||
elif isinstance(token, tokens.HeadingStart): | |||
return self._handle_heading(token) | |||
elif isinstance(token, tokens.TagOpenOpen): | |||
return self._handle_tag(token) | |||
def build(self, tokenlist): | |||
self._tokens = tokenlist | |||
self._tokens.reverse() | |||
self._push() | |||
while self._tokens: | |||
node = self._handle_token(self._tokens.pop(0)) | |||
node = self._handle_token(self._tokens.pop()) | |||
self._write(node) | |||
return self._pop() |
@@ -20,7 +20,22 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
TEMPLATE = 0b111 | |||
TEMPLATE_NAME = 0b001 | |||
TEMPLATE_PARAM_KEY = 0b010 | |||
TEMPLATE_PARAM_VALUE = 0b100 | |||
# Local (stack-specific) contexts: | |||
TEMPLATE = 0b000000111 | |||
TEMPLATE_NAME = 0b000000001 | |||
TEMPLATE_PARAM_KEY = 0b000000010 | |||
TEMPLATE_PARAM_VALUE = 0b000000100 | |||
HEADING = 0b111111000 | |||
HEADING_LEVEL_1 = 0b000001000 | |||
HEADING_LEVEL_2 = 0b000010000 | |||
HEADING_LEVEL_3 = 0b000100000 | |||
HEADING_LEVEL_4 = 0b001000000 | |||
HEADING_LEVEL_5 = 0b010000000 | |||
HEADING_LEVEL_6 = 0b100000000 | |||
# Global contexts: | |||
GL_HEADING = 0b1 |
@@ -21,6 +21,7 @@ | |||
# SOFTWARE. | |||
import htmlentitydefs | |||
from math import log | |||
import re | |||
import string | |||
@@ -32,17 +33,19 @@ __all__ = ["Tokenizer"] | |||
# Raised by Tokenizer._fail_route() to abandon the current parse route; the
# heading and entity parsers catch it and fall back to writing plain text.
class BadRoute(Exception): | |||
pass | |||
class Tokenizer(object): | |||
START = object() | |||
END = object() | |||
SENTINELS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", | |||
":", "/", "-", END] | |||
regex = re.compile(r"([{}\[\]<>|=&#*;:/-])", flags=re.IGNORECASE) | |||
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", | |||
"/", "-", "\n", END] | |||
regex = re.compile(r"([{}\[\]<>|=&#*;:/\-\n])", flags=re.IGNORECASE) | |||
# Set up empty tokenizer state; no text is attached yet.
def __init__(self): | |||
self._text = None | |||
# Index into self._text; _read() looks up self._text[self._head + delta].
self._head = 0 | |||
self._stacks = [] | |||
# Bit flags for global contexts (e.g. contexts.GL_HEADING).
self._global = 0 | |||
@property | |||
def _stack(self): | |||
@@ -76,6 +79,10 @@ class Tokenizer(object): | |||
self._push_textbuffer() | |||
return self._stacks.pop()[0] | |||
# Abandon the current route: pop the current stack (its contents are
# discarded) and raise BadRoute for whoever started the route to catch.
# Always raises; never returns normally.
def _fail_route(self): | |||
self._pop() | |||
raise BadRoute() | |||
def _write(self, token): | |||
self._push_textbuffer() | |||
self._stack.append(token) | |||
@@ -84,16 +91,20 @@ class Tokenizer(object): | |||
self._textbuffer.append(text) | |||
# Write a whole list of tokens onto the current stack.
def _write_all(self, tokenlist): | |||
# A leading Text token is removed from tokenlist (the caller's list is
# mutated) and routed through _write_text() instead of being appended
# as a token -- presumably so it joins any text already buffered.
if tokenlist and isinstance(tokenlist[0], tokens.Text): | |||
self._write_text(tokenlist.pop(0).text) | |||
self._push_textbuffer() | |||
self._stack.extend(tokenlist) | |||
def _read(self, delta=0, wrap=False): | |||
def _read(self, delta=0, wrap=False, strict=False): | |||
index = self._head + delta | |||
if index < 0 and (not wrap or abs(index) > len(self._text)): | |||
return self.START | |||
try: | |||
return self._text[index] | |||
except IndexError: | |||
if strict: | |||
self._fail_route() | |||
return self.END | |||
def _parse_template(self): | |||
@@ -115,7 +126,7 @@ class Tokenizer(object): | |||
text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] | |||
text = "".join([token.text for token in text]) | |||
if text.strip() and "\n" in text.strip(): | |||
raise BadRoute(self._pop()) | |||
self._fail_route() | |||
def _handle_template_param(self): | |||
if self._context & contexts.TEMPLATE_NAME: | |||
@@ -137,44 +148,98 @@ class Tokenizer(object): | |||
self._head += 1 | |||
return self._pop() | |||
def _parse_entity(self): | |||
def _parse_heading(self): | |||
self._global |= contexts.GL_HEADING | |||
reset = self._head | |||
self._push() | |||
self._head += 1 | |||
best = 1 | |||
while self._read() == "=": | |||
best += 1 | |||
self._head += 1 | |||
context = contexts.HEADING_LEVEL_1 << min(best - 1, 5) | |||
try: | |||
self._write(tokens.HTMLEntityStart()) | |||
title, level = self._parse(context) | |||
except BadRoute: | |||
self._head = reset + best - 1 | |||
self._write_text("=" * best) | |||
else: | |||
self._write(tokens.HeadingStart(level=level)) | |||
if level < best: | |||
self._write_text("=" * (best - level)) | |||
self._write_all(title) | |||
self._write(tokens.HeadingEnd()) | |||
finally: | |||
self._global ^= contexts.GL_HEADING | |||
def _handle_heading_end(self): | |||
reset = self._head | |||
self._head += 1 | |||
best = 1 | |||
while self._read() == "=": | |||
best += 1 | |||
self._head += 1 | |||
this = self._read() | |||
if not this or this is self.END: | |||
raise BadRoute(self._pop()) | |||
numeric = hexadecimal = False | |||
if this == "#": | |||
numeric = True | |||
self._write(tokens.HTMLEntityNumeric()) | |||
self._head += 1 | |||
this = self._read() | |||
if not this or this is self.END: | |||
raise BadRoute(self._pop()) | |||
if this[0].lower() == "x": | |||
hexadecimal = True | |||
self._write(tokens.HTMLEntityHex(char=this[0])) | |||
this = this[1:] | |||
valid = string.hexdigits if hexadecimal else string.digits | |||
if not numeric and not hexadecimal: | |||
valid += string.ascii_letters | |||
if not all([char in valid for char in this]): | |||
raise BadRoute(self._pop()) | |||
current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1 | |||
level = min(current, min(best, 6)) | |||
try: | |||
after, after_level = self._parse(self._context) | |||
except BadRoute: | |||
if level < best: | |||
self._write_text("=" * (best - level)) | |||
self._head = reset + best - 1 | |||
return self._pop(), level | |||
else: | |||
self._write_text("=" * best) | |||
self._write_all(after) | |||
return self._pop(), after_level | |||
def _really_parse_entity(self): | |||
self._write(tokens.HTMLEntityStart()) | |||
self._head += 1 | |||
this = self._read(strict=True) | |||
if this == "#": | |||
numeric = True | |||
self._write(tokens.HTMLEntityNumeric()) | |||
self._head += 1 | |||
if self._read() != ";": | |||
raise BadRoute(self._pop()) | |||
if numeric: | |||
test = int(this, 16) if hexadecimal else int(this) | |||
if test < 1 or test > 0x10FFFF: | |||
raise BadRoute(self._pop()) | |||
this = self._read(strict=True) | |||
if this[0].lower() == "x": | |||
hexadecimal = True | |||
self._write(tokens.HTMLEntityHex(char=this[0])) | |||
this = this[1:] | |||
if not this: | |||
self._fail_route() | |||
else: | |||
if this not in htmlentitydefs.entitydefs: | |||
raise BadRoute(self._pop()) | |||
self._write(tokens.Text(text=this)) | |||
self._write(tokens.HTMLEntityEnd()) | |||
hexadecimal = False | |||
else: | |||
numeric = hexadecimal = False | |||
valid = string.hexdigits if hexadecimal else string.digits | |||
if not numeric and not hexadecimal: | |||
valid += string.ascii_letters | |||
if not all([char in valid for char in this]): | |||
self._fail_route() | |||
self._head += 1 | |||
if self._read() != ";": | |||
self._fail_route() | |||
if numeric: | |||
test = int(this, 16) if hexadecimal else int(this) | |||
if test < 1 or test > 0x10FFFF: | |||
self._fail_route() | |||
else: | |||
if this not in htmlentitydefs.entitydefs: | |||
self._fail_route() | |||
self._write(tokens.Text(text=this)) | |||
self._write(tokens.HTMLEntityEnd()) | |||
def _parse_entity(self): | |||
reset = self._head | |||
self._push() | |||
try: | |||
self._really_parse_entity() | |||
except BadRoute: | |||
self._head = reset | |||
self._write_text(self._read()) | |||
@@ -185,15 +250,15 @@ class Tokenizer(object): | |||
self._push(context) | |||
while True: | |||
this = self._read() | |||
if this not in self.SENTINELS: | |||
if this not in self.MARKERS: | |||
self._write_text(this) | |||
self._head += 1 | |||
continue | |||
if this is self.END: | |||
if self._context & contexts.TEMPLATE: | |||
raise BadRoute(self._pop()) | |||
if self._context & (contexts.TEMPLATE | contexts.HEADING): | |||
self._fail_route() | |||
return self._pop() | |||
next = self._read(1) | |||
prev, next = self._read(-1), self._read(1) | |||
if this == next == "{": | |||
self._parse_template() | |||
elif this == "|" and self._context & contexts.TEMPLATE: | |||
@@ -202,6 +267,12 @@ class Tokenizer(object): | |||
self._handle_template_param_value() | |||
elif this == next == "}" and self._context & contexts.TEMPLATE: | |||
return self._handle_template_end() | |||
elif (prev == "\n" or prev == self.START) and this == "=" and not self._global & contexts.GL_HEADING: | |||
self._parse_heading() | |||
elif this == "=" and self._context & contexts.HEADING: | |||
return self._handle_heading_end() | |||
elif this == "\n" and self._context & contexts.HEADING: | |||
self._fail_route() | |||
elif this == "&": | |||
self._parse_entity() | |||
else: | |||
@@ -29,7 +29,7 @@ class Token(object): | |||
def __repr__(self): | |||
args = [] | |||
for key, value in self._kwargs.iteritems(): | |||
if len(value) > 100: | |||
if isinstance(value, basestring) and len(value) > 100: | |||
args.append(key + "=" + repr(value[:97] + "...")) | |||
else: | |||
args.append(key + "=" + repr(value)) | |||
@@ -66,7 +66,8 @@ HTMLEntityNumeric = make("HTMLEntityNumeric") # # | |||
HTMLEntityHex = make("HTMLEntityHex") # x | |||
HTMLEntityEnd = make("HTMLEntityEnd") # ; | |||
HeadingBlock = make("HeadingBlock") # =... | |||
HeadingStart = make("HeadingStart") # =... | |||
HeadingEnd = make("HeadingEnd") # =... | |||
TagOpenOpen = make("TagOpenOpen") # < | |||
TagAttrStart = make("TagAttrStart") | |||