
Don't get stuck in tags with unclosed quoted attributes (fixes #190).
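
The failure mode behind #190: wikitext that opens many nested tags whose quoted attribute values never close. Each nesting level speculatively re-parsed the rest of the input as both a quoted and an unquoted value, so parse time grew exponentially with depth. This commit memoizes failed parse routes so each one is attempted only once. A rough reproduction sketch (the exact trigger input is an assumption based on the issue title, not the test case from #190):

    import mwparserfromhell

    # 30 nested tags, none of whose attribute quotes ever close
    text = '<span style="' * 30
    wikicode = mwparserfromhell.parse(text)  # used to hang; now returns
    print(str(wikicode) == text)  # True: the dead ends fall back to text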

Ben Kurtovic committed 6 years ago · commit 86c805d59b · tags/v0.5.1

9 changed files with 71 additions and 25 deletions:
  1. CHANGELOG                                         +3   -1
  2. LICENSE                                           +1   -1
  3. docs/changelog.rst                                +4   -0
  4. docs/conf.py                                      +1   -1
  5. mwparserfromhell/__init__.py                      +2   -2
  6. mwparserfromhell/parser/ctokenizer/tok_parse.c    +12  -4
  7. mwparserfromhell/parser/ctokenizer/tok_support.c  +19  -9
  8. mwparserfromhell/parser/ctokenizer/tok_support.h  +2   -1
  9. mwparserfromhell/parser/tokenizer.py              +27  -6

CHANGELOG (+3, -1)

@@ -1,7 +1,9 @@
 v0.6 (unreleased):

 - Improved behavior when adding parameters to templates (via Template.add())
-  with poorly formatted whitespace conventions.
+  with poorly formatted whitespace conventions. (#185)
+- Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
+  quoted attributes. (#190)

 v0.5 (released June 23, 2017):



LICENSE (+1, -1)

@@ -1,4 +1,4 @@
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal


docs/changelog.rst (+4, -0)

@@ -9,6 +9,10 @@ Unreleased

 - Improved behavior when adding parameters to templates (via
   :meth:`.Template.add`) with poorly formatted whitespace conventions.
+  (`#185 <https://github.com/earwig/mwparserfromhell/issues/185>`_)
+- Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
+  quoted attributes.
+  (`#190 <https://github.com/earwig/mwparserfromhell/issues/190>`_)

 v0.5
 ----


docs/conf.py (+1, -1)

@@ -42,7 +42,7 @@ master_doc = 'index'

 # General information about the project.
 project = u'mwparserfromhell'
-copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic'
+copyright = u'2012–2018 Ben Kurtovic'

 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the


mwparserfromhell/__init__.py (+2, -2)

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode.
 """

 __author__ = "Ben Kurtovic"
-__copyright__ = "Copyright (C) 2012-2017 Ben Kurtovic"
+__copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic"
 __license__ = "MIT License"
 __version__ = "0.6.dev0"
 __email__ = "ben.kurtovic@gmail.com"


mwparserfromhell/parser/ctokenizer/tok_parse.c (+12, -4)

@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -722,7 +722,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
 */
 static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
 {
-#define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS
 #define NOT_A_LINK \
     if (!brackets && self->topstack->context & LC_DLTERM) \
         return Tokenizer_handle_dl_term(self); \
@@ -732,7 +731,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
     PyObject *link, *kwargs;
     Textbuffer *extra;

-    if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
+    if (self->topstack->context & AGG_NO_EXT_LINKS ||
+            !(Tokenizer_CAN_RECURSE(self))) {
         NOT_A_LINK;
     }
     extra = Textbuffer_new(&self->text);
@@ -1280,6 +1280,7 @@ static int Tokenizer_handle_tag_data(
     else if (data->context & TAG_NOTE_SPACE) {
         if (data->context & TAG_QUOTED) {
             data->context = TAG_ATTR_VALUE;
+            Tokenizer_memoize_bad_route(self);
             trash = Tokenizer_pop(self);
             Py_XDECREF(trash);
             self->head = data->reset - 1; // Will be auto-incremented
@@ -1317,7 +1318,12 @@
         data->context |= TAG_QUOTED;
         data->quoter = chunk;
         data->reset = self->head;
-        if (Tokenizer_push(self, self->topstack->context))
+        if (Tokenizer_check_route(self, self->topstack->context) < 0) {
+            RESET_ROUTE();
+            data->context = TAG_ATTR_VALUE;
+            self->head--;
+        }
+        else if (Tokenizer_push(self, self->topstack->context))
             return -1;
         return 0;
     }
@@ -1613,6 +1619,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
         if (data->context & TAG_QUOTED) {
             // Unclosed attribute quote: reset, don't die
             data->context = TAG_ATTR_VALUE;
+            Tokenizer_memoize_bad_route(self);
             trash = Tokenizer_pop(self);
             Py_XDECREF(trash);
             self->head = data->reset;
@@ -2185,6 +2192,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token)
         if (data->context & TAG_QUOTED) {
             // Unclosed attribute quote: reset, don't die
             data->context = TAG_ATTR_VALUE;
+            Tokenizer_memoize_bad_route(self);
             trash = Tokenizer_pop(self);
             Py_XDECREF(trash);
             self->head = data->reset;
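
Every tok_parse.c change above follows the same pattern: before speculatively parsing a quoted attribute value, ask Tokenizer_check_route whether this exact route (the stack ident, i.e. head position plus context) has already failed, and when an unclosed quote forces a reset, memoize the route before popping the stack. A minimal Python rendering of the check-before-push step, with illustrative names rather than the library's API:

    TAG_ATTR_VALUE = 1 << 0  # stand-in for the real context flag

    def start_quoted_value(parser, data):
        """Begin a speculative quoted-value parse unless it is doomed."""
        data.reset = parser.head
        if (parser.head, parser.context) in parser.bad_routes:
            # check_route failed: skip the speculative parse and re-read
            # the quote character as ordinary attribute text instead.
            data.context = TAG_ATTR_VALUE
            parser.head -= 1
            return False
        parser.push(parser.context)  # speculative sub-parse begins here
        return True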


mwparserfromhell/parser/ctokenizer/tok_support.c (+19, -9)

@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -147,6 +147,22 @@ static int compare_nodes(
 }

 /*
+    Remember that the current route (head + context at push) is invalid.
+
+    This will be noticed when calling Tokenizer_check_route with the same head
+    and context, and the route will be failed immediately.
+*/
+void Tokenizer_memoize_bad_route(Tokenizer *self)
+{
+    route_tree_node *node = malloc(sizeof(route_tree_node));
+    if (node) {
+        node->id = self->topstack->ident;
+        if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
+            free(node);
+    }
+}
+
+/*
     Fail the current tokenization route. Discards the current
     stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the
     ident of the failed stack so future parsing attempts down this route can be
@@ -157,13 +173,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
     uint64_t context = self->topstack->context;
     PyObject* stack;

-    route_tree_node *node = malloc(sizeof(route_tree_node));
-    if (node) {
-        node->id = self->topstack->ident;
-        if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
-            free(node);
-    }
-
+    Tokenizer_memoize_bad_route(self);
     stack = Tokenizer_pop(self);
     Py_XDECREF(stack);
     FAIL_ROUTE(context);
@@ -173,7 +183,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
 /*
     Check if pushing a new route here with the given context would definitely
     fail, based on a previous call to Tokenizer_fail_route() with the same
-    stack.
+    stack. (Or any other call to Tokenizer_memoize_bad_route().)

     Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the
     latter case.
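
The C tokenizer keys its bad-route memo on the stack ident and stores it in an AVL tree (avl_tree_insert); the Python tokenizer below uses a plain set for the same contract. A self-contained sketch of that contract, with illustrative names:

    class RouteMemo:
        """Remember (head, context) routes that already failed to parse."""

        def __init__(self):
            self._bad_routes = set()

        def memoize(self, head, context):
            # Analogue of Tokenizer_memoize_bad_route(): record the route.
            self._bad_routes.add((head, context))

        def check(self, head, context):
            # Analogue of Tokenizer_check_route(): is this route doomed?
            return (head, context) in self._bad_routes

    memo = RouteMemo()
    memo.memoize(17, 0b100)       # parsing at head 17 in this context failed
    assert memo.check(17, 0b100)  # the same route is now refused instantly
    assert not memo.check(18, 0b100)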


mwparserfromhell/parser/ctokenizer/tok_support.h (+2, -1)

@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -31,6 +31,7 @@ int Tokenizer_push_textbuffer(Tokenizer*);
 void Tokenizer_delete_top_of_stack(Tokenizer*);
 PyObject* Tokenizer_pop(Tokenizer*);
 PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
+void Tokenizer_memoize_bad_route(Tokenizer*);
 void* Tokenizer_fail_route(Tokenizer*);
 int Tokenizer_check_route(Tokenizer*, uint64_t);
 void Tokenizer_free_bad_route_tree(Tokenizer*);


mwparserfromhell/parser/tokenizer.py (+27, -6)

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -144,6 +144,14 @@ class Tokenizer(object):
         """Return whether or not our max recursion depth has been exceeded."""
         return self._depth < self.MAX_DEPTH

+    def _memoize_bad_route(self):
+        """Remember that the current route (head + context at push) is invalid.
+
+        This will be noticed when calling _push with the same head and context,
+        and the route will be failed immediately.
+        """
+        self._bad_routes.add(self._stack_ident)
+
     def _fail_route(self):
         """Fail the current tokenization route.

@@ -151,7 +159,7 @@
             :exc:`.BadRoute`.
         """
         context = self._context
-        self._bad_routes.add(self._stack_ident)
+        self._memoize_bad_route()
         self._pop()
         raise BadRoute(context)

@@ -506,12 +514,16 @@

     def _parse_external_link(self, brackets):
         """Parse an external link at the head of the wikicode string."""
+        if self._context & contexts.NO_EXT_LINKS or not self._can_recurse():
+            if not brackets and self._context & contexts.DL_TERM:
+                self._handle_dl_term()
+            else:
+                self._emit_text(self._read())
+            return
+
         reset = self._head
         self._head += 1
         try:
-            bad_context = self._context & contexts.NO_EXT_LINKS
-            if bad_context or not self._can_recurse():
-                raise BadRoute()
             link, extra, delta = self._really_parse_external_link(brackets)
         except BadRoute:
             self._head = reset
@@ -719,6 +731,7 @@
         elif data.context & data.CX_NOTE_SPACE:
             if data.context & data.CX_QUOTED:
                 data.context = data.CX_ATTR_VALUE
+                self._memoize_bad_route()
                 self._pop()
                 self._head = data.reset - 1  # Will be auto-incremented
                 return  # Break early
@@ -743,7 +756,13 @@
                 data.context |= data.CX_QUOTED
                 data.quoter = chunk
                 data.reset = self._head
-                self._push(self._context)
+                try:
+                    self._push(self._context)
+                except BadRoute:
+                    # Already failed to parse this as a quoted string
+                    data.context = data.CX_ATTR_VALUE
+                    self._head -= 1
+                    return
                 continue
             elif data.context & data.CX_QUOTED:
                 if chunk == data.quoter and not escaped:
@@ -845,6 +864,7 @@
             if data.context & data.CX_QUOTED:
                 # Unclosed attribute quote: reset, don't die
                 data.context = data.CX_ATTR_VALUE
+                self._memoize_bad_route()
                 self._pop()
                 self._head = data.reset
                 continue
@@ -1084,6 +1104,7 @@
             if data.context & data.CX_QUOTED:
                 # Unclosed attribute quote: reset, don't die
                 data.context = data.CX_ATTR_VALUE
+                self._memoize_bad_route()
                 self._pop()
                 self._head = data.reset
                 continue
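
Taken together, the tokenizer.py hunks implement "reset, don't die" plus memoization: an unclosed quote abandons the speculative quoted parse, records the route as bad, rewinds, and re-reads the quote as plain attribute text, and the recorded route stops any enclosing tag from retrying the same dead end. A standalone sketch of that control flow (hypothetical helper, far simpler than the real tokenizer):

    def read_attr_value(text, pos, bad_routes):
        """Return (value, next_pos); prefer a quoted value, else fall back."""
        if text[pos] in '"\'' and pos not in bad_routes:
            end = text.find(text[pos], pos + 1)
            if end != -1:
                return text[pos + 1:end], end + 1  # quote closed: use it
            bad_routes.add(pos)  # memoize: this quoted parse can never work
        # Fallback route: an unquoted value ends at whitespace or '>'
        end = pos
        while end < len(text) and text[end] not in ' \t\n>':
            end += 1
        return text[pos:end], end

    bad = set()
    print(read_attr_value('"never closes', 0, bad))  # ('"never', 6)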

