Browse Source

Fully fix parsing templates with blank names, I hope (#111)

tags/v0.4.1
Ben Kurtovic 8 years ago
parent
commit
f16c7e25ca
7 changed files with 85 additions and 53 deletions
  1. +3
    -0
      CHANGELOG
  2. +5
    -0
      docs/changelog.rst
  3. +10
    -8
      mwparserfromhell/parser/contexts.py
  4. +32
    -20
      mwparserfromhell/parser/tokenizer.c
  5. +11
    -10
      mwparserfromhell/parser/tokenizer.h
  6. +22
    -13
      mwparserfromhell/parser/tokenizer.py
  7. +2
    -2
      tests/tokenizer/templates.mwtest

+ 3
- 0
CHANGELOG View File

@@ -8,6 +8,9 @@ v0.4.1 (unreleased):
includes when denoting tags, but not comments.
- Fixed the behavior of preserve_spacing in Template.add() and keep_field in
Template.remove() on parameters with hidden keys.
- Fixed parser bugs involving:
- templates with completely blank names;
- templates with newlines and comments.
- Fixed some bugs in the release scripts.

v0.4 (released May 23, 2015):


+ 5
- 0
docs/changelog.rst View File

@@ -15,6 +15,11 @@ Unreleased
This includes when denoting tags, but not comments.
- Fixed the behavior of *preserve_spacing* in :func:`~.Template.add` and
*keep_field* in :func:`~.Template.remove` on parameters with hidden keys.
- Fixed parser bugs involving:

- templates with completely blank names;
- templates with newlines and comments.

- Fixed some bugs in the release scripts.

v0.4


+ 10
- 8
mwparserfromhell/parser/contexts.py View File

@@ -89,6 +89,7 @@ Local (stack-specific) contexts:
* :const:`FAIL_ON_LBRACE`
* :const:`FAIL_ON_RBRACE`
* :const:`FAIL_ON_EQUALS`
* :const:`HAS_TEMPLATE`

* :const:`TABLE`

@@ -161,15 +162,16 @@ FAIL_NEXT = 1 << 26
FAIL_ON_LBRACE = 1 << 27
FAIL_ON_RBRACE = 1 << 28
FAIL_ON_EQUALS = 1 << 29
HAS_TEMPLATE = 1 << 30
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS)
TABLE_OPEN = 1 << 30
TABLE_CELL_OPEN = 1 << 31
TABLE_CELL_STYLE = 1 << 32
TABLE_ROW_OPEN = 1 << 33
TABLE_TD_LINE = 1 << 34
TABLE_TH_LINE = 1 << 35
FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE)
TABLE_OPEN = 1 << 31
TABLE_CELL_OPEN = 1 << 32
TABLE_CELL_STYLE = 1 << 33
TABLE_ROW_OPEN = 1 << 34
TABLE_TD_LINE = 1 << 35
TABLE_TH_LINE = 1 << 36
TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
TABLE_TD_LINE + TABLE_TH_LINE)


+ 32
- 20
mwparserfromhell/parser/tokenizer.c View File

@@ -582,12 +582,16 @@ static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
/*
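Parse a template at the head of the wikicode string.

has_content is nonzero when the caller has already parsed a nested template
or argument inside the current run of opening braces. In that case the name
is parsed with LC_HAS_TEMPLATE set, so it is not rejected as a blank name.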
*/
static int Tokenizer_parse_template(Tokenizer* self)
static int Tokenizer_parse_template(Tokenizer* self, int has_content)
{
PyObject *template;
Py_ssize_t reset = self->head;
uint64_t context = LC_TEMPLATE_NAME;

template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1);
if (has_content)
context |= LC_HAS_TEMPLATE;

template = Tokenizer_parse(self, context, 1);
if (BAD_ROUTE) {
self->head = reset;
return 0;
@@ -643,6 +647,7 @@ static int Tokenizer_parse_argument(Tokenizer* self)
static int Tokenizer_parse_template_or_argument(Tokenizer* self)
{
unsigned int braces = 2, i;
int has_content = 0;
PyObject *tokenlist;

self->head += 2;
@@ -659,7 +664,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
return 0;
}
if (braces == 2) {
if (Tokenizer_parse_template(self))
if (Tokenizer_parse_template(self, has_content))
return -1;
if (BAD_ROUTE) {
RESET_ROUTE();
@@ -673,7 +678,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
return -1;
if (BAD_ROUTE) {
RESET_ROUTE();
if (Tokenizer_parse_template(self))
if (Tokenizer_parse_template(self, has_content))
return -1;
if (BAD_ROUTE) {
char text[MAX_BRACES + 1];
@@ -689,8 +694,10 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
}
else
braces -= 3;
if (braces)
if (braces) {
has_content = 1;
self->head++;
}
}
tokenlist = Tokenizer_pop(self);
if (!tokenlist)
@@ -712,8 +719,13 @@ static int Tokenizer_handle_template_param(Tokenizer* self)
{
PyObject *stack;

if (self->topstack->context & LC_TEMPLATE_NAME)
if (self->topstack->context & LC_TEMPLATE_NAME) {
if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) {
Tokenizer_fail_route(self);
return -1;
}
self->topstack->context ^= LC_TEMPLATE_NAME;
}
else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE)
self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE;
if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
@@ -764,7 +776,11 @@ static PyObject* Tokenizer_handle_template_end(Tokenizer* self)
{
PyObject* stack;

if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
if (self->topstack->context & LC_TEMPLATE_NAME) {
if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE)))
return Tokenizer_fail_route(self);
}
else if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
stack = Tokenizer_pop_keeping_context(self);
if (!stack)
return NULL;
@@ -2885,30 +2901,26 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
if (context & LC_TAG_CLOSE)
return (data == '<') ? -1 : 0;
if (context & LC_TEMPLATE_NAME) {
if (data == '{' || data == '}' || data == '[') {
if (data == '{') {
self->topstack->context |= LC_HAS_TEMPLATE | LC_FAIL_NEXT;
return 0;
}
if (data == '}' || (data == '<' && Tokenizer_READ(self, 1) == '!')) {
self->topstack->context |= LC_FAIL_NEXT;
return 0;
}
if (data == ']' || data == '>' || (data == '<' &&
Tokenizer_READ(self, 1) != '!')) {
if (data == '[' || data == ']' || data == '<' || data == '>') {
return -1;
}
if (data == '|')
return 0;
if (context & LC_HAS_TEXT) {
if (context & LC_FAIL_ON_TEXT) {
if (!Py_UNICODE_ISSPACE(data)) {
if (data == '<' && Tokenizer_READ(self, 1) == '!') {
self->topstack->context |= LC_FAIL_NEXT;
return 0;
}
if (!Py_UNICODE_ISSPACE(data))
return -1;
}
}
else {
if (data == '\n')
self->topstack->context |= LC_FAIL_ON_TEXT;
}
else if (data == '\n')
self->topstack->context |= LC_FAIL_ON_TEXT;
}
else if (!Py_UNICODE_ISSPACE(data))
self->topstack->context |= LC_HAS_TEXT;


+ 11
- 10
mwparserfromhell/parser/tokenizer.h View File

@@ -150,22 +150,23 @@ static PyObject* TagCloseClose;

#define LC_DLTERM 0x0000000000800000

#define LC_SAFETY_CHECK 0x000000003F000000
#define LC_SAFETY_CHECK 0x000000007F000000
#define LC_HAS_TEXT 0x0000000001000000
#define LC_FAIL_ON_TEXT 0x0000000002000000
#define LC_FAIL_NEXT 0x0000000004000000
#define LC_FAIL_ON_LBRACE 0x0000000008000000
#define LC_FAIL_ON_RBRACE 0x0000000010000000
#define LC_FAIL_ON_EQUALS 0x0000000020000000

#define LC_TABLE 0x0000000FC0000000
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000
#define LC_TABLE_OPEN 0x0000000040000000
#define LC_TABLE_CELL_OPEN 0x0000000080000000
#define LC_TABLE_CELL_STYLE 0x0000000100000000
#define LC_TABLE_ROW_OPEN 0x0000000200000000
#define LC_TABLE_TD_LINE 0x0000000400000000
#define LC_TABLE_TH_LINE 0x0000000800000000
#define LC_HAS_TEMPLATE 0x0000000040000000

#define LC_TABLE 0x0000001F80000000
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000
#define LC_TABLE_OPEN 0x0000000080000000
#define LC_TABLE_CELL_OPEN 0x0000000100000000
#define LC_TABLE_CELL_STYLE 0x0000000200000000
#define LC_TABLE_ROW_OPEN 0x0000000400000000
#define LC_TABLE_TD_LINE 0x0000000800000000
#define LC_TABLE_TH_LINE 0x0000001000000000

/* Global contexts: */



+ 22
- 13
mwparserfromhell/parser/tokenizer.py View File

@@ -192,11 +192,14 @@ class Tokenizer(object):
self._fail_route()
return self.END

def _parse_template(self):
def _parse_template(self, has_content):
"""Parse a template at the head of the wikicode string."""
reset = self._head
context = contexts.TEMPLATE_NAME
if has_content:
context |= contexts.HAS_TEMPLATE
try:
template = self._parse(contexts.TEMPLATE_NAME)
template = self._parse(context)
except BadRoute:
self._head = reset
raise
@@ -223,6 +226,7 @@ class Tokenizer(object):
while self._read() == "{":
self._head += 1
braces += 1
has_content = False
self._push()

while braces:
@@ -230,7 +234,7 @@ class Tokenizer(object):
return self._emit_text_then_stack("{")
if braces == 2:
try:
self._parse_template()
self._parse_template(has_content)
except BadRoute:
return self._emit_text_then_stack("{{")
break
@@ -239,11 +243,12 @@ class Tokenizer(object):
braces -= 3
except BadRoute:
try:
self._parse_template()
self._parse_template(has_content)
braces -= 2
except BadRoute:
return self._emit_text_then_stack("{" * braces)
if braces:
has_content = True
self._head += 1

self._emit_all(self._pop())
@@ -253,6 +258,8 @@ class Tokenizer(object):
def _handle_template_param(self):
"""Handle a template parameter at the head of the string."""
if self._context & contexts.TEMPLATE_NAME:
if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE):
self._fail_route()
self._context ^= contexts.TEMPLATE_NAME
elif self._context & contexts.TEMPLATE_PARAM_VALUE:
self._context ^= contexts.TEMPLATE_PARAM_VALUE
@@ -271,7 +278,10 @@ class Tokenizer(object):

def _handle_template_end(self):
"""Handle the end of a template at the head of the string."""
if self._context & contexts.TEMPLATE_PARAM_KEY:
if self._context & contexts.TEMPLATE_NAME:
if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE):
self._fail_route()
elif self._context & contexts.TEMPLATE_PARAM_KEY:
self._emit_all(self._pop(keep_context=True))
self._head += 1
return self._pop()
@@ -1183,23 +1193,22 @@ class Tokenizer(object):
elif context & contexts.EXT_LINK_TITLE:
return this != "\n"
elif context & contexts.TEMPLATE_NAME:
if this == "{" or this == "}" or this == "[":
if this == "{":
self._context |= contexts.HAS_TEMPLATE | contexts.FAIL_NEXT
return True
if this == "}" or (this == "<" and self._read(1) == "!"):
self._context |= contexts.FAIL_NEXT
return True
if this == "]" or this == ">" or (this == "<" and self._read(1) != "!"):
if this == "[" or this == "]" or this == "<" or this == ">":
return False
if this == "|":
return True
if context & contexts.HAS_TEXT:
if context & contexts.FAIL_ON_TEXT:
if this is self.END or not this.isspace():
if this == "<" and self._read(1) == "!":
self._context |= contexts.FAIL_NEXT
return True
return False
else:
if this == "\n":
self._context |= contexts.FAIL_ON_TEXT
elif this == "\n":
self._context |= contexts.FAIL_ON_TEXT
elif this is self.END or not this.isspace():
self._context |= contexts.HAS_TEXT
return True


+ 2
- 2
tests/tokenizer/templates.mwtest View File

@@ -686,5 +686,5 @@ output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{

name: recursion_opens_and_closes
label: test potentially dangerous recursion: template openings and closings
input: "{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}"
output: [Text(text="{{|"), TemplateOpen(), TemplateClose(), Text(text="{{|"), TemplateOpen(), TemplateClose(), TemplateOpen(), TemplateParamSeparator(), TemplateOpen(), TemplateClose(), Text(text="{{"), TemplateParamSeparator(), Text(text="{{"), TemplateClose(), Text(text="{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}")]
input: "{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}"
output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), TemplateOpen(), Text(text="x"), TemplateParamSeparator(), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x"), TemplateParamSeparator(), Text(text="{{x"), TemplateClose(), Text(text="{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}")]

Loading…
Cancel
Save