diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 7240051..5b17351 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -118,7 +118,7 @@ class Template(Node): def get(self, name): name = name.strip() if isinstance(name, basestring) else unicode(name) - for param in self.params: + for param in reversed(self.params): if param.name.strip() == name: return param raise ValueError(name) @@ -149,8 +149,9 @@ class Template(Node): else: int_keys = set() for param in self.params: - if re.match(r"[1-9][0-9]*$", param.name.strip()): - int_keys.add(int(unicode(param.name))) + if not param.showkey: + if re.match(r"[1-9][0-9]*$", param.name.strip()): + int_keys.add(int(unicode(param.name))) expected = min(set(range(1, len(int_keys) + 2)) - int_keys) if expected == int_name: showkey = False @@ -170,7 +171,7 @@ class Template(Node): self.params.append(param) return param - def remove(self, name, keep_field=False, force_no_field=False): + def remove(self, name, keep_field=False, force_no_field=False): # KEEP FIRST FIELD, REMOVE ALL AFTER name = name.strip() if isinstance(name, basestring) else unicode(name) for i, param in enumerate(self.params): if param.name.strip() == name: diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 9edc987..d352321 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -20,8 +20,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import re - from . import tokens from ..nodes import Heading, HTMLEntity, Tag, Template, Text from ..nodes.extras import Attribute, Parameter @@ -49,42 +47,39 @@ class Builder(object): def _write(self, item): self._stacks[-1].append(item) - def _handle_parameter(self, key): + def _handle_parameter(self, default): + key = None showkey = False self._push() while self._tokens: - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.TemplateParamEquals): key = self._pop() showkey = True self._push() elif isinstance(token, (tokens.TemplateParamSeparator, tokens.TemplateClose)): - self._tokens.insert(0, token) + self._tokens.append(token) value = self._pop() + if not key: + key = self._wrap([Text(unicode(default))]) return Parameter(key, value, showkey) else: self._write(self._handle_token(token)) def _handle_template(self): params = [] - int_keys = set() - int_key_range = {1} + default = 1 self._push() while self._tokens: - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.TemplateParamSeparator): if not params: name = self._pop() - default = unicode(min(int_key_range - int_keys)) - param = self._handle_parameter(self._wrap([Text(default)])) - if re.match(r"[1-9][0-9]*$", param.name.strip()): - # We try a more restrictive test for integers than - # try: int(), because "01" as a key will pass through int() - # correctly but is not a valid integer key in wikicode: - int_keys.add(int(unicode(param.name))) - int_key_range.add(len(int_keys) + 1) + param = self._handle_parameter(default) params.append(param) + if not param.showkey: + default += 1 elif isinstance(token, tokens.TemplateClose): if not params: name = self._pop() @@ -93,25 +88,25 @@ class Builder(object): self._write(self._handle_token(token)) def _handle_entity(self): - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.HTMLEntityNumeric): - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.HTMLEntityHex): - text = self._tokens.pop(0) - self._tokens.pop(0) # Remove HTMLEntityEnd + text = self._tokens.pop() + self._tokens.pop() # Remove HTMLEntityEnd return HTMLEntity(text.text, named=False, hexadecimal=True, hex_char=token.char) - self._tokens.pop(0) # Remove HTMLEntityEnd + self._tokens.pop() # Remove HTMLEntityEnd return HTMLEntity(token.text, named=False, hexadecimal=False) - self._tokens.pop(0) # Remove HTMLEntityEnd + self._tokens.pop() # Remove HTMLEntityEnd return HTMLEntity(token.text, named=True, hexadecimal=False) def _handle_heading(self, token): level = token.level self._push() while self._tokens: - token = self._tokens.pop(0) - if isinstance(token, tokens.HeadingBlock): + token = self._tokens.pop() + if isinstance(token, tokens.HeadingEnd): title = self._pop() return Heading(title, level) else: @@ -121,7 +116,7 @@ class Builder(object): name, quoted = None, False self._push() while self._tokens: - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.TagAttrEquals): name = self._pop() self._push() @@ -129,7 +124,7 @@ class Builder(object): quoted = True elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen)): - self._tokens.insert(0, token) + self._tokens.append(token) if name is not None: return Attribute(name, self._pop(), quoted) return Attribute(self._pop(), quoted=quoted) @@ -141,7 +136,7 @@ class Builder(object): attrs = [] self._push() while self._tokens: - token = self._tokens.pop(0) + token = self._tokens.pop() if isinstance(token, tokens.TagAttrStart): attrs.append(self._handle_attribute()) elif isinstance(token, tokens.TagCloseOpen): @@ -167,15 +162,16 @@ class Builder(object): return self._handle_template() elif isinstance(token, tokens.HTMLEntityStart): return self._handle_entity() - elif isinstance(token, tokens.HeadingBlock): + elif isinstance(token, tokens.HeadingStart): return self._handle_heading(token) elif isinstance(token, tokens.TagOpenOpen): return self._handle_tag(token) def build(self, tokenlist): self._tokens = tokenlist + self._tokens.reverse() self._push() while self._tokens: - node = self._handle_token(self._tokens.pop(0)) + node = self._handle_token(self._tokens.pop()) self._write(node) return self._pop() diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index f966a1b..6369ee2 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -20,7 +20,22 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -TEMPLATE = 0b111 -TEMPLATE_NAME = 0b001 -TEMPLATE_PARAM_KEY = 0b010 -TEMPLATE_PARAM_VALUE = 0b100 +# Local (stack-specific) contexts: + +TEMPLATE = 0b000000111 +TEMPLATE_NAME = 0b000000001 +TEMPLATE_PARAM_KEY = 0b000000010 +TEMPLATE_PARAM_VALUE = 0b000000100 + +HEADING = 0b111111000 +HEADING_LEVEL_1 = 0b000001000 +HEADING_LEVEL_2 = 0b000010000 +HEADING_LEVEL_3 = 0b000100000 +HEADING_LEVEL_4 = 0b001000000 +HEADING_LEVEL_5 = 0b010000000 +HEADING_LEVEL_6 = 0b100000000 + + +# Global contexts: + +GL_HEADING = 0b1 diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index a2606e9..159ba67 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -21,6 +21,7 @@ # SOFTWARE. import htmlentitydefs +from math import log import re import string @@ -32,17 +33,19 @@ __all__ = ["Tokenizer"] class BadRoute(Exception): pass + class Tokenizer(object): START = object() END = object() - SENTINELS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", - ":", "/", "-", END] - regex = re.compile(r"([{}\[\]<>|=&#*;:/-])", flags=re.IGNORECASE) + MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", + "/", "-", "\n", END] + regex = re.compile(r"([{}\[\]<>|=&#*;:/\-\n])", flags=re.IGNORECASE) def __init__(self): self._text = None self._head = 0 self._stacks = [] + self._global = 0 @property def _stack(self): @@ -76,6 +79,10 @@ class Tokenizer(object): self._push_textbuffer() return self._stacks.pop()[0] + def _fail_route(self): + self._pop() + raise BadRoute() + def _write(self, token): self._push_textbuffer() self._stack.append(token) @@ -84,16 +91,20 @@ class Tokenizer(object): self._textbuffer.append(text) def _write_all(self, tokenlist): + if tokenlist and isinstance(tokenlist[0], tokens.Text): + self._write_text(tokenlist.pop(0).text) self._push_textbuffer() self._stack.extend(tokenlist) - def _read(self, delta=0, wrap=False): + def _read(self, delta=0, wrap=False, strict=False): index = self._head + delta if index < 0 and (not wrap or abs(index) > len(self._text)): return self.START try: return self._text[index] except IndexError: + if strict: + self._fail_route() return self.END def _parse_template(self): @@ -115,7 +126,7 @@ class Tokenizer(object): text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] text = "".join([token.text for token in text]) if text.strip() and "\n" in text.strip(): - raise BadRoute(self._pop()) + self._fail_route() def _handle_template_param(self): if self._context & contexts.TEMPLATE_NAME: @@ -137,44 +148,98 @@ class Tokenizer(object): self._head += 1 return self._pop() - def _parse_entity(self): + def _parse_heading(self): + self._global |= contexts.GL_HEADING reset = self._head - self._push() + self._head += 1 + best = 1 + while self._read() == "=": + best += 1 + self._head += 1 + context = contexts.HEADING_LEVEL_1 << min(best - 1, 5) + try: - self._write(tokens.HTMLEntityStart()) + title, level = self._parse(context) + except BadRoute: + self._head = reset + best - 1 + self._write_text("=" * best) + else: + self._write(tokens.HeadingStart(level=level)) + if level < best: + self._write_text("=" * (best - level)) + self._write_all(title) + self._write(tokens.HeadingEnd()) + finally: + self._global ^= contexts.GL_HEADING + + def _handle_heading_end(self): + reset = self._head + self._head += 1 + best = 1 + while self._read() == "=": + best += 1 self._head += 1 - this = self._read() - if not this or this is self.END: - raise BadRoute(self._pop()) - numeric = hexadecimal = False - if this == "#": - numeric = True - self._write(tokens.HTMLEntityNumeric()) - self._head += 1 - this = self._read() - if not this or this is self.END: - raise BadRoute(self._pop()) - if this[0].lower() == "x": - hexadecimal = True - self._write(tokens.HTMLEntityHex(char=this[0])) - this = this[1:] - valid = string.hexdigits if hexadecimal else string.digits - if not numeric and not hexadecimal: - valid += string.ascii_letters - if not all([char in valid for char in this]): - raise BadRoute(self._pop()) + current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1 + level = min(current, min(best, 6)) + + try: + after, after_level = self._parse(self._context) + except BadRoute: + if level < best: + self._write_text("=" * (best - level)) + self._head = reset + best - 1 + return self._pop(), level + else: + self._write_text("=" * best) + self._write_all(after) + return self._pop(), after_level + + def _really_parse_entity(self): + self._write(tokens.HTMLEntityStart()) + self._head += 1 + + this = self._read(strict=True) + if this == "#": + numeric = True + self._write(tokens.HTMLEntityNumeric()) self._head += 1 - if self._read() != ";": - raise BadRoute(self._pop()) - if numeric: - test = int(this, 16) if hexadecimal else int(this) - if test < 1 or test > 0x10FFFF: - raise BadRoute(self._pop()) + this = self._read(strict=True) + if this[0].lower() == "x": + hexadecimal = True + self._write(tokens.HTMLEntityHex(char=this[0])) + this = this[1:] + if not this: + self._fail_route() else: - if this not in htmlentitydefs.entitydefs: - raise BadRoute(self._pop()) - self._write(tokens.Text(text=this)) - self._write(tokens.HTMLEntityEnd()) + hexadecimal = False + else: + numeric = hexadecimal = False + + valid = string.hexdigits if hexadecimal else string.digits + if not numeric and not hexadecimal: + valid += string.ascii_letters + if not all([char in valid for char in this]): + self._fail_route() + + self._head += 1 + if self._read() != ";": + self._fail_route() + if numeric: + test = int(this, 16) if hexadecimal else int(this) + if test < 1 or test > 0x10FFFF: + self._fail_route() + else: + if this not in htmlentitydefs.entitydefs: + self._fail_route() + + self._write(tokens.Text(text=this)) + self._write(tokens.HTMLEntityEnd()) + + def _parse_entity(self): + reset = self._head + self._push() + try: + self._really_parse_entity() except BadRoute: self._head = reset self._write_text(self._read()) @@ -185,15 +250,15 @@ class Tokenizer(object): self._push(context) while True: this = self._read() - if this not in self.SENTINELS: + if this not in self.MARKERS: self._write_text(this) self._head += 1 continue if this is self.END: - if self._context & contexts.TEMPLATE: - raise BadRoute(self._pop()) + if self._context & (contexts.TEMPLATE | contexts.HEADING): + self._fail_route() return self._pop() - next = self._read(1) + prev, next = self._read(-1), self._read(1) if this == next == "{": self._parse_template() elif this == "|" and self._context & contexts.TEMPLATE: @@ -202,6 +267,12 @@ class Tokenizer(object): self._handle_template_param_value() elif this == next == "}" and self._context & contexts.TEMPLATE: return self._handle_template_end() + elif (prev == "\n" or prev == self.START) and this == "=" and not self._global & contexts.GL_HEADING: + self._parse_heading() + elif this == "=" and self._context & contexts.HEADING: + return self._handle_heading_end() + elif this == "\n" and self._context & contexts.HEADING: + self._fail_route() elif this == "&": self._parse_entity() else: diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 88881c7..3cb73c9 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -29,7 +29,7 @@ class Token(object): def __repr__(self): args = [] for key, value in self._kwargs.iteritems(): - if len(value) > 100: + if isinstance(value, basestring) and len(value) > 100: args.append(key + "=" + repr(value[:97] + "...")) else: args.append(key + "=" + repr(value)) @@ -66,7 +66,8 @@ HTMLEntityNumeric = make("HTMLEntityNumeric") # # HTMLEntityHex = make("HTMLEntityHex") # x HTMLEntityEnd = make("HTMLEntityEnd") # ; -HeadingBlock = make("HeadingBlock") # =... +HeadingStart = make("HeadingStart") # =... +HeadingEnd = make("HeadingEnd") # =... TagOpenOpen = make("TagOpenOpen") # < TagAttrStart = make("TagAttrStart")