* Some other fixes, additions.tags/v0.1
@@ -118,7 +118,7 @@ class Template(Node): | |||||
def get(self, name): | def get(self, name): | ||||
name = name.strip() if isinstance(name, basestring) else unicode(name) | name = name.strip() if isinstance(name, basestring) else unicode(name) | ||||
for param in self.params: | |||||
for param in reversed(self.params): | |||||
if param.name.strip() == name: | if param.name.strip() == name: | ||||
return param | return param | ||||
raise ValueError(name) | raise ValueError(name) | ||||
@@ -149,8 +149,9 @@ class Template(Node): | |||||
else: | else: | ||||
int_keys = set() | int_keys = set() | ||||
for param in self.params: | for param in self.params: | ||||
if re.match(r"[1-9][0-9]*$", param.name.strip()): | |||||
int_keys.add(int(unicode(param.name))) | |||||
if not param.showkey: | |||||
if re.match(r"[1-9][0-9]*$", param.name.strip()): | |||||
int_keys.add(int(unicode(param.name))) | |||||
expected = min(set(range(1, len(int_keys) + 2)) - int_keys) | expected = min(set(range(1, len(int_keys) + 2)) - int_keys) | ||||
if expected == int_name: | if expected == int_name: | ||||
showkey = False | showkey = False | ||||
@@ -170,7 +171,7 @@ class Template(Node): | |||||
self.params.append(param) | self.params.append(param) | ||||
return param | return param | ||||
def remove(self, name, keep_field=False, force_no_field=False): | |||||
def remove(self, name, keep_field=False, force_no_field=False): # KEEP FIRST FIELD, REMOVE ALL AFTER | |||||
name = name.strip() if isinstance(name, basestring) else unicode(name) | name = name.strip() if isinstance(name, basestring) else unicode(name) | ||||
for i, param in enumerate(self.params): | for i, param in enumerate(self.params): | ||||
if param.name.strip() == name: | if param.name.strip() == name: | ||||
@@ -20,8 +20,6 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
import re | |||||
from . import tokens | from . import tokens | ||||
from ..nodes import Heading, HTMLEntity, Tag, Template, Text | from ..nodes import Heading, HTMLEntity, Tag, Template, Text | ||||
from ..nodes.extras import Attribute, Parameter | from ..nodes.extras import Attribute, Parameter | ||||
@@ -49,42 +47,39 @@ class Builder(object): | |||||
def _write(self, item): | def _write(self, item): | ||||
self._stacks[-1].append(item) | self._stacks[-1].append(item) | ||||
def _handle_parameter(self, key): | |||||
def _handle_parameter(self, default): | |||||
key = None | |||||
showkey = False | showkey = False | ||||
self._push() | self._push() | ||||
while self._tokens: | while self._tokens: | ||||
token = self._tokens.pop(0) | |||||
token = self._tokens.pop() | |||||
if isinstance(token, tokens.TemplateParamEquals): | if isinstance(token, tokens.TemplateParamEquals): | ||||
key = self._pop() | key = self._pop() | ||||
showkey = True | showkey = True | ||||
self._push() | self._push() | ||||
elif isinstance(token, (tokens.TemplateParamSeparator, | elif isinstance(token, (tokens.TemplateParamSeparator, | ||||
tokens.TemplateClose)): | tokens.TemplateClose)): | ||||
self._tokens.insert(0, token) | |||||
self._tokens.append(token) | |||||
value = self._pop() | value = self._pop() | ||||
if not key: | |||||
key = self._wrap([Text(unicode(default))]) | |||||
return Parameter(key, value, showkey) | return Parameter(key, value, showkey) | ||||
else: | else: | ||||
self._write(self._handle_token(token)) | self._write(self._handle_token(token)) | ||||
def _handle_template(self): | def _handle_template(self): | ||||
params = [] | params = [] | ||||
int_keys = set() | |||||
int_key_range = {1} | |||||
default = 1 | |||||
self._push() | self._push() | ||||
while self._tokens: | while self._tokens: | ||||
token = self._tokens.pop(0) | |||||
token = self._tokens.pop() | |||||
if isinstance(token, tokens.TemplateParamSeparator): | if isinstance(token, tokens.TemplateParamSeparator): | ||||
if not params: | if not params: | ||||
name = self._pop() | name = self._pop() | ||||
default = unicode(min(int_key_range - int_keys)) | |||||
param = self._handle_parameter(self._wrap([Text(default)])) | |||||
if re.match(r"[1-9][0-9]*$", param.name.strip()): | |||||
# We try a more restrictive test for integers than | |||||
# try: int(), because "01" as a key will pass through int() | |||||
# correctly but is not a valid integer key in wikicode: | |||||
int_keys.add(int(unicode(param.name))) | |||||
int_key_range.add(len(int_keys) + 1) | |||||
param = self._handle_parameter(default) | |||||
params.append(param) | params.append(param) | ||||
if not param.showkey: | |||||
default += 1 | |||||
elif isinstance(token, tokens.TemplateClose): | elif isinstance(token, tokens.TemplateClose): | ||||
if not params: | if not params: | ||||
name = self._pop() | name = self._pop() | ||||
@@ -93,25 +88,25 @@ class Builder(object): | |||||
self._write(self._handle_token(token)) | self._write(self._handle_token(token)) | ||||
def _handle_entity(self): | def _handle_entity(self): | ||||
token = self._tokens.pop(0) | |||||
token = self._tokens.pop() | |||||
if isinstance(token, tokens.HTMLEntityNumeric): | if isinstance(token, tokens.HTMLEntityNumeric): | ||||
token = self._tokens.pop(0) | |||||
token = self._tokens.pop() | |||||
if isinstance(token, tokens.HTMLEntityHex): | if isinstance(token, tokens.HTMLEntityHex): | ||||
text = self._tokens.pop(0) | |||||
self._tokens.pop(0) # Remove HTMLEntityEnd | |||||
text = self._tokens.pop() | |||||
self._tokens.pop() # Remove HTMLEntityEnd | |||||
return HTMLEntity(text.text, named=False, hexadecimal=True, | return HTMLEntity(text.text, named=False, hexadecimal=True, | ||||
hex_char=token.char) | hex_char=token.char) | ||||
self._tokens.pop(0) # Remove HTMLEntityEnd | |||||
self._tokens.pop() # Remove HTMLEntityEnd | |||||
return HTMLEntity(token.text, named=False, hexadecimal=False) | return HTMLEntity(token.text, named=False, hexadecimal=False) | ||||
self._tokens.pop(0) # Remove HTMLEntityEnd | |||||
self._tokens.pop() # Remove HTMLEntityEnd | |||||
return HTMLEntity(token.text, named=True, hexadecimal=False) | return HTMLEntity(token.text, named=True, hexadecimal=False) | ||||
def _handle_heading(self, token): | def _handle_heading(self, token): | ||||
level = token.level | level = token.level | ||||
self._push() | self._push() | ||||
while self._tokens: | while self._tokens: | ||||
token = self._tokens.pop(0) | |||||
if isinstance(token, tokens.HeadingBlock): | |||||
token = self._tokens.pop() | |||||
if isinstance(token, tokens.HeadingEnd): | |||||
title = self._pop() | title = self._pop() | ||||
return Heading(title, level) | return Heading(title, level) | ||||
else: | else: | ||||
@@ -121,7 +116,7 @@ class Builder(object): | |||||
name, quoted = None, False | name, quoted = None, False | ||||
self._push() | self._push() | ||||
while self._tokens: | while self._tokens: | ||||
token = self._tokens.pop(0) | |||||
token = self._tokens.pop() | |||||
if isinstance(token, tokens.TagAttrEquals): | if isinstance(token, tokens.TagAttrEquals): | ||||
name = self._pop() | name = self._pop() | ||||
self._push() | self._push() | ||||
@@ -129,7 +124,7 @@ class Builder(object): | |||||
quoted = True | quoted = True | ||||
elif isinstance(token, (tokens.TagAttrStart, | elif isinstance(token, (tokens.TagAttrStart, | ||||
tokens.TagCloseOpen)): | tokens.TagCloseOpen)): | ||||
self._tokens.insert(0, token) | |||||
self._tokens.append(token) | |||||
if name is not None: | if name is not None: | ||||
return Attribute(name, self._pop(), quoted) | return Attribute(name, self._pop(), quoted) | ||||
return Attribute(self._pop(), quoted=quoted) | return Attribute(self._pop(), quoted=quoted) | ||||
@@ -141,7 +136,7 @@ class Builder(object): | |||||
attrs = [] | attrs = [] | ||||
self._push() | self._push() | ||||
while self._tokens: | while self._tokens: | ||||
token = self._tokens.pop(0) | |||||
token = self._tokens.pop() | |||||
if isinstance(token, tokens.TagAttrStart): | if isinstance(token, tokens.TagAttrStart): | ||||
attrs.append(self._handle_attribute()) | attrs.append(self._handle_attribute()) | ||||
elif isinstance(token, tokens.TagCloseOpen): | elif isinstance(token, tokens.TagCloseOpen): | ||||
@@ -167,15 +162,16 @@ class Builder(object): | |||||
return self._handle_template() | return self._handle_template() | ||||
elif isinstance(token, tokens.HTMLEntityStart): | elif isinstance(token, tokens.HTMLEntityStart): | ||||
return self._handle_entity() | return self._handle_entity() | ||||
elif isinstance(token, tokens.HeadingBlock): | |||||
elif isinstance(token, tokens.HeadingStart): | |||||
return self._handle_heading(token) | return self._handle_heading(token) | ||||
elif isinstance(token, tokens.TagOpenOpen): | elif isinstance(token, tokens.TagOpenOpen): | ||||
return self._handle_tag(token) | return self._handle_tag(token) | ||||
def build(self, tokenlist): | def build(self, tokenlist): | ||||
self._tokens = tokenlist | self._tokens = tokenlist | ||||
self._tokens.reverse() | |||||
self._push() | self._push() | ||||
while self._tokens: | while self._tokens: | ||||
node = self._handle_token(self._tokens.pop(0)) | |||||
node = self._handle_token(self._tokens.pop()) | |||||
self._write(node) | self._write(node) | ||||
return self._pop() | return self._pop() |
@@ -20,7 +20,22 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
TEMPLATE = 0b111 | |||||
TEMPLATE_NAME = 0b001 | |||||
TEMPLATE_PARAM_KEY = 0b010 | |||||
TEMPLATE_PARAM_VALUE = 0b100 | |||||
# Local (stack-specific) contexts: | |||||
TEMPLATE = 0b000000111 | |||||
TEMPLATE_NAME = 0b000000001 | |||||
TEMPLATE_PARAM_KEY = 0b000000010 | |||||
TEMPLATE_PARAM_VALUE = 0b000000100 | |||||
HEADING = 0b111111000 | |||||
HEADING_LEVEL_1 = 0b000001000 | |||||
HEADING_LEVEL_2 = 0b000010000 | |||||
HEADING_LEVEL_3 = 0b000100000 | |||||
HEADING_LEVEL_4 = 0b001000000 | |||||
HEADING_LEVEL_5 = 0b010000000 | |||||
HEADING_LEVEL_6 = 0b100000000 | |||||
# Global contexts: | |||||
GL_HEADING = 0b1 |
@@ -21,6 +21,7 @@ | |||||
# SOFTWARE. | # SOFTWARE. | ||||
import htmlentitydefs | import htmlentitydefs | ||||
from math import log | |||||
import re | import re | ||||
import string | import string | ||||
@@ -32,17 +33,19 @@ __all__ = ["Tokenizer"] | |||||
class BadRoute(Exception): | class BadRoute(Exception): | ||||
pass | pass | ||||
class Tokenizer(object): | class Tokenizer(object): | ||||
START = object() | START = object() | ||||
END = object() | END = object() | ||||
SENTINELS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", | |||||
":", "/", "-", END] | |||||
regex = re.compile(r"([{}\[\]<>|=&#*;:/-])", flags=re.IGNORECASE) | |||||
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", | |||||
"/", "-", "\n", END] | |||||
regex = re.compile(r"([{}\[\]<>|=&#*;:/\-\n])", flags=re.IGNORECASE) | |||||
def __init__(self): | def __init__(self): | ||||
self._text = None | self._text = None | ||||
self._head = 0 | self._head = 0 | ||||
self._stacks = [] | self._stacks = [] | ||||
self._global = 0 | |||||
@property | @property | ||||
def _stack(self): | def _stack(self): | ||||
@@ -76,6 +79,10 @@ class Tokenizer(object): | |||||
self._push_textbuffer() | self._push_textbuffer() | ||||
return self._stacks.pop()[0] | return self._stacks.pop()[0] | ||||
def _fail_route(self): | |||||
self._pop() | |||||
raise BadRoute() | |||||
def _write(self, token): | def _write(self, token): | ||||
self._push_textbuffer() | self._push_textbuffer() | ||||
self._stack.append(token) | self._stack.append(token) | ||||
@@ -84,16 +91,20 @@ class Tokenizer(object): | |||||
self._textbuffer.append(text) | self._textbuffer.append(text) | ||||
def _write_all(self, tokenlist): | def _write_all(self, tokenlist): | ||||
if tokenlist and isinstance(tokenlist[0], tokens.Text): | |||||
self._write_text(tokenlist.pop(0).text) | |||||
self._push_textbuffer() | self._push_textbuffer() | ||||
self._stack.extend(tokenlist) | self._stack.extend(tokenlist) | ||||
def _read(self, delta=0, wrap=False): | |||||
def _read(self, delta=0, wrap=False, strict=False): | |||||
index = self._head + delta | index = self._head + delta | ||||
if index < 0 and (not wrap or abs(index) > len(self._text)): | if index < 0 and (not wrap or abs(index) > len(self._text)): | ||||
return self.START | return self.START | ||||
try: | try: | ||||
return self._text[index] | return self._text[index] | ||||
except IndexError: | except IndexError: | ||||
if strict: | |||||
self._fail_route() | |||||
return self.END | return self.END | ||||
def _parse_template(self): | def _parse_template(self): | ||||
@@ -115,7 +126,7 @@ class Tokenizer(object): | |||||
text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] | text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] | ||||
text = "".join([token.text for token in text]) | text = "".join([token.text for token in text]) | ||||
if text.strip() and "\n" in text.strip(): | if text.strip() and "\n" in text.strip(): | ||||
raise BadRoute(self._pop()) | |||||
self._fail_route() | |||||
def _handle_template_param(self): | def _handle_template_param(self): | ||||
if self._context & contexts.TEMPLATE_NAME: | if self._context & contexts.TEMPLATE_NAME: | ||||
@@ -137,44 +148,98 @@ class Tokenizer(object): | |||||
self._head += 1 | self._head += 1 | ||||
return self._pop() | return self._pop() | ||||
def _parse_entity(self): | |||||
def _parse_heading(self): | |||||
self._global |= contexts.GL_HEADING | |||||
reset = self._head | reset = self._head | ||||
self._push() | |||||
self._head += 1 | |||||
best = 1 | |||||
while self._read() == "=": | |||||
best += 1 | |||||
self._head += 1 | |||||
context = contexts.HEADING_LEVEL_1 << min(best - 1, 5) | |||||
try: | try: | ||||
self._write(tokens.HTMLEntityStart()) | |||||
title, level = self._parse(context) | |||||
except BadRoute: | |||||
self._head = reset + best - 1 | |||||
self._write_text("=" * best) | |||||
else: | |||||
self._write(tokens.HeadingStart(level=level)) | |||||
if level < best: | |||||
self._write_text("=" * (best - level)) | |||||
self._write_all(title) | |||||
self._write(tokens.HeadingEnd()) | |||||
finally: | |||||
self._global ^= contexts.GL_HEADING | |||||
def _handle_heading_end(self): | |||||
reset = self._head | |||||
self._head += 1 | |||||
best = 1 | |||||
while self._read() == "=": | |||||
best += 1 | |||||
self._head += 1 | self._head += 1 | ||||
this = self._read() | |||||
if not this or this is self.END: | |||||
raise BadRoute(self._pop()) | |||||
numeric = hexadecimal = False | |||||
if this == "#": | |||||
numeric = True | |||||
self._write(tokens.HTMLEntityNumeric()) | |||||
self._head += 1 | |||||
this = self._read() | |||||
if not this or this is self.END: | |||||
raise BadRoute(self._pop()) | |||||
if this[0].lower() == "x": | |||||
hexadecimal = True | |||||
self._write(tokens.HTMLEntityHex(char=this[0])) | |||||
this = this[1:] | |||||
valid = string.hexdigits if hexadecimal else string.digits | |||||
if not numeric and not hexadecimal: | |||||
valid += string.ascii_letters | |||||
if not all([char in valid for char in this]): | |||||
raise BadRoute(self._pop()) | |||||
current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1 | |||||
level = min(current, min(best, 6)) | |||||
try: | |||||
after, after_level = self._parse(self._context) | |||||
except BadRoute: | |||||
if level < best: | |||||
self._write_text("=" * (best - level)) | |||||
self._head = reset + best - 1 | |||||
return self._pop(), level | |||||
else: | |||||
self._write_text("=" * best) | |||||
self._write_all(after) | |||||
return self._pop(), after_level | |||||
def _really_parse_entity(self): | |||||
self._write(tokens.HTMLEntityStart()) | |||||
self._head += 1 | |||||
this = self._read(strict=True) | |||||
if this == "#": | |||||
numeric = True | |||||
self._write(tokens.HTMLEntityNumeric()) | |||||
self._head += 1 | self._head += 1 | ||||
if self._read() != ";": | |||||
raise BadRoute(self._pop()) | |||||
if numeric: | |||||
test = int(this, 16) if hexadecimal else int(this) | |||||
if test < 1 or test > 0x10FFFF: | |||||
raise BadRoute(self._pop()) | |||||
this = self._read(strict=True) | |||||
if this[0].lower() == "x": | |||||
hexadecimal = True | |||||
self._write(tokens.HTMLEntityHex(char=this[0])) | |||||
this = this[1:] | |||||
if not this: | |||||
self._fail_route() | |||||
else: | else: | ||||
if this not in htmlentitydefs.entitydefs: | |||||
raise BadRoute(self._pop()) | |||||
self._write(tokens.Text(text=this)) | |||||
self._write(tokens.HTMLEntityEnd()) | |||||
hexadecimal = False | |||||
else: | |||||
numeric = hexadecimal = False | |||||
valid = string.hexdigits if hexadecimal else string.digits | |||||
if not numeric and not hexadecimal: | |||||
valid += string.ascii_letters | |||||
if not all([char in valid for char in this]): | |||||
self._fail_route() | |||||
self._head += 1 | |||||
if self._read() != ";": | |||||
self._fail_route() | |||||
if numeric: | |||||
test = int(this, 16) if hexadecimal else int(this) | |||||
if test < 1 or test > 0x10FFFF: | |||||
self._fail_route() | |||||
else: | |||||
if this not in htmlentitydefs.entitydefs: | |||||
self._fail_route() | |||||
self._write(tokens.Text(text=this)) | |||||
self._write(tokens.HTMLEntityEnd()) | |||||
def _parse_entity(self): | |||||
reset = self._head | |||||
self._push() | |||||
try: | |||||
self._really_parse_entity() | |||||
except BadRoute: | except BadRoute: | ||||
self._head = reset | self._head = reset | ||||
self._write_text(self._read()) | self._write_text(self._read()) | ||||
@@ -185,15 +250,15 @@ class Tokenizer(object): | |||||
self._push(context) | self._push(context) | ||||
while True: | while True: | ||||
this = self._read() | this = self._read() | ||||
if this not in self.SENTINELS: | |||||
if this not in self.MARKERS: | |||||
self._write_text(this) | self._write_text(this) | ||||
self._head += 1 | self._head += 1 | ||||
continue | continue | ||||
if this is self.END: | if this is self.END: | ||||
if self._context & contexts.TEMPLATE: | |||||
raise BadRoute(self._pop()) | |||||
if self._context & (contexts.TEMPLATE | contexts.HEADING): | |||||
self._fail_route() | |||||
return self._pop() | return self._pop() | ||||
next = self._read(1) | |||||
prev, next = self._read(-1), self._read(1) | |||||
if this == next == "{": | if this == next == "{": | ||||
self._parse_template() | self._parse_template() | ||||
elif this == "|" and self._context & contexts.TEMPLATE: | elif this == "|" and self._context & contexts.TEMPLATE: | ||||
@@ -202,6 +267,12 @@ class Tokenizer(object): | |||||
self._handle_template_param_value() | self._handle_template_param_value() | ||||
elif this == next == "}" and self._context & contexts.TEMPLATE: | elif this == next == "}" and self._context & contexts.TEMPLATE: | ||||
return self._handle_template_end() | return self._handle_template_end() | ||||
elif (prev == "\n" or prev == self.START) and this == "=" and not self._global & contexts.GL_HEADING: | |||||
self._parse_heading() | |||||
elif this == "=" and self._context & contexts.HEADING: | |||||
return self._handle_heading_end() | |||||
elif this == "\n" and self._context & contexts.HEADING: | |||||
self._fail_route() | |||||
elif this == "&": | elif this == "&": | ||||
self._parse_entity() | self._parse_entity() | ||||
else: | else: | ||||
@@ -29,7 +29,7 @@ class Token(object): | |||||
def __repr__(self): | def __repr__(self): | ||||
args = [] | args = [] | ||||
for key, value in self._kwargs.iteritems(): | for key, value in self._kwargs.iteritems(): | ||||
if len(value) > 100: | |||||
if isinstance(value, basestring) and len(value) > 100: | |||||
args.append(key + "=" + repr(value[:97] + "...")) | args.append(key + "=" + repr(value[:97] + "...")) | ||||
else: | else: | ||||
args.append(key + "=" + repr(value)) | args.append(key + "=" + repr(value)) | ||||
@@ -66,7 +66,8 @@ HTMLEntityNumeric = make("HTMLEntityNumeric") # # | |||||
HTMLEntityHex = make("HTMLEntityHex") # x | HTMLEntityHex = make("HTMLEntityHex") # x | ||||
HTMLEntityEnd = make("HTMLEntityEnd") # ; | HTMLEntityEnd = make("HTMLEntityEnd") # ; | ||||
HeadingBlock = make("HeadingBlock") # =... | |||||
HeadingStart = make("HeadingStart") # =... | |||||
HeadingEnd = make("HeadingEnd") # =... | |||||
TagOpenOpen = make("TagOpenOpen") # < | TagOpenOpen = make("TagOpenOpen") # < | ||||
TagAttrStart = make("TagAttrStart") | TagAttrStart = make("TagAttrStart") | ||||