
Don't get stuck in tags with unclosed quoted attributes (fixes #190).

tags/v0.5.1
Ben Kurtovic committed 6 years ago
commit 86c805d59b
9 changed files with 71 additions and 25 deletions
  1. +3  -1   CHANGELOG
  2. +1  -1   LICENSE
  3. +4  -0   docs/changelog.rst
  4. +1  -1   docs/conf.py
  5. +2  -2   mwparserfromhell/__init__.py
  6. +12 -4   mwparserfromhell/parser/ctokenizer/tok_parse.c
  7. +19 -9   mwparserfromhell/parser/ctokenizer/tok_support.c
  8. +2  -1   mwparserfromhell/parser/ctokenizer/tok_support.h
  9. +27 -6   mwparserfromhell/parser/tokenizer.py

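For context on the fix: the failure mode in #190 involves deeply nested tags whose quoted attribute values are never closed, so each nesting level abandons the quoted-value route and re-parses everything inside it, compounding the work at every depth. A rough reproduction sketch (the input shape is illustrative, not the exact case from the issue):

    import mwparserfromhell

    # Each repetition opens a tag whose quoted attribute value never closes.
    # Before this commit, every nesting level retried (and re-failed) the same
    # quoted-value parse, so runtime exploded with depth; with bad routes
    # memoized, each (position, context) attempt fails at most once.
    depth = 30
    text = '<span style="' * depth + "end"

    wikicode = mwparserfromhell.parse(text)
    assert str(wikicode) == text  # parse trees always round-trip to the input
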
CHANGELOG (+3 -1)

@@ -1,7 +1,9 @@
 v0.6 (unreleased):
 
 - Improved behavior when adding parameters to templates (via Template.add())
-  with poorly formatted whitespace conventions.
+  with poorly formatted whitespace conventions. (#185)
+- Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
+  quoted attributes. (#190)
 
 v0.5 (released June 23, 2017):

LICENSE (+1 -1)

@@ -1,4 +1,4 @@
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal


docs/changelog.rst (+4 -0)

@@ -9,6 +9,10 @@ Unreleased
 
 - Improved behavior when adding parameters to templates (via
   :meth:`.Template.add`) with poorly formatted whitespace conventions.
+  (`#185 <https://github.com/earwig/mwparserfromhell/issues/185>`_)
+- Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
+  quoted attributes.
+  (`#190 <https://github.com/earwig/mwparserfromhell/issues/190>`_)
 
 v0.5
 ----


docs/conf.py (+1 -1)

@@ -42,7 +42,7 @@ master_doc = 'index'
 
 # General information about the project.
 project = u'mwparserfromhell'
-copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic'
+copyright = u'2012–2018 Ben Kurtovic'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the


mwparserfromhell/__init__.py (+2 -2)

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode.
 """
 
 __author__ = "Ben Kurtovic"
-__copyright__ = "Copyright (C) 2012-2017 Ben Kurtovic"
+__copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic"
 __license__ = "MIT License"
 __version__ = "0.6.dev0"
 __email__ = "ben.kurtovic@gmail.com"


mwparserfromhell/parser/ctokenizer/tok_parse.c (+12 -4)

@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -722,7 +722,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
 */
 static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
 {
-#define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS
 #define NOT_A_LINK \
     if (!brackets && self->topstack->context & LC_DLTERM) \
         return Tokenizer_handle_dl_term(self); \
@@ -732,7 +731,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
     PyObject *link, *kwargs;
     Textbuffer *extra;
 
-    if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
+    if (self->topstack->context & AGG_NO_EXT_LINKS ||
+            !(Tokenizer_CAN_RECURSE(self))) {
         NOT_A_LINK;
     }
     extra = Textbuffer_new(&self->text);
@@ -1280,6 +1280,7 @@ static int Tokenizer_handle_tag_data(
     else if (data->context & TAG_NOTE_SPACE) {
         if (data->context & TAG_QUOTED) {
             data->context = TAG_ATTR_VALUE;
+            Tokenizer_memoize_bad_route(self);
             trash = Tokenizer_pop(self);
             Py_XDECREF(trash);
             self->head = data->reset - 1; // Will be auto-incremented
@@ -1317,7 +1318,12 @@ static int Tokenizer_handle_tag_data(
             data->context |= TAG_QUOTED;
             data->quoter = chunk;
             data->reset = self->head;
-            if (Tokenizer_push(self, self->topstack->context))
+            if (Tokenizer_check_route(self, self->topstack->context) < 0) {
+                RESET_ROUTE();
+                data->context = TAG_ATTR_VALUE;
+                self->head--;
+            }
+            else if (Tokenizer_push(self, self->topstack->context))
                 return -1;
             return 0;
         }
@@ -1613,6 +1619,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
     if (data->context & TAG_QUOTED) {
         // Unclosed attribute quote: reset, don't die
         data->context = TAG_ATTR_VALUE;
+        Tokenizer_memoize_bad_route(self);
         trash = Tokenizer_pop(self);
         Py_XDECREF(trash);
         self->head = data->reset;
@@ -2185,6 +2192,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token
     if (data->context & TAG_QUOTED) {
         // Unclosed attribute quote: reset, don't die
         data->context = TAG_ATTR_VALUE;
+        Tokenizer_memoize_bad_route(self);
         trash = Tokenizer_pop(self);
         Py_XDECREF(trash);
         self->head = data->reset;


mwparserfromhell/parser/ctokenizer/tok_support.c (+19 -9)

@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -147,6 +147,22 @@ static int compare_nodes(
 }
 
 /*
+    Remember that the current route (head + context at push) is invalid.
+
+    This will be noticed when calling Tokenizer_check_route with the same head
+    and context, and the route will be failed immediately.
+*/
+void Tokenizer_memoize_bad_route(Tokenizer *self)
+{
+    route_tree_node *node = malloc(sizeof(route_tree_node));
+    if (node) {
+        node->id = self->topstack->ident;
+        if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
+            free(node);
+    }
+}
+
+/*
     Fail the current tokenization route. Discards the current
     stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the
     ident of the failed stack so future parsing attempts down this route can be
@@ -157,13 +173,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
     uint64_t context = self->topstack->context;
     PyObject* stack;
 
-    route_tree_node *node = malloc(sizeof(route_tree_node));
-    if (node) {
-        node->id = self->topstack->ident;
-        if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
-            free(node);
-    }
-
+    Tokenizer_memoize_bad_route(self);
     stack = Tokenizer_pop(self);
     Py_XDECREF(stack);
     FAIL_ROUTE(context);
@@ -173,7 +183,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
 /*
     Check if pushing a new route here with the given context would definitely
     fail, based on a previous call to Tokenizer_fail_route() with the same
-    stack.
+    stack. (Or any other call to Tokenizer_memoize_bad_route().)
 
     Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the
     latter case.

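The routine factored out above is plain memoized backtracking. A minimal self-contained sketch of the same pattern in Python, with illustrative names rather than the library's API: record every (position, context) attempt that fails, and refuse to start the same attempt twice.

    class BadRoute(Exception):
        """Raised when a parse attempt cannot succeed."""

    class MiniParser:
        def __init__(self, text):
            self.text = text
            self.bad_routes = set()  # (position, context) pairs known to fail

        def parse_quoted(self, start):
            """Try to read a double-quoted string beginning at `start`."""
            ident = (start, "quoted")
            if ident in self.bad_routes:    # cf. Tokenizer_check_route
                raise BadRoute()            # known dead end: fail immediately
            end = self.text.find('"', start + 1)
            if end == -1:                   # unclosed quote
                self.bad_routes.add(ident)  # cf. Tokenizer_memoize_bad_route
                raise BadRoute()
            return self.text[start:end + 1]

Without the set, a caller that retries parse_quoted once per nesting level rescans the same suffix over and over; with it, each position fails at most once and the whole parse stays roughly linear.
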

mwparserfromhell/parser/ctokenizer/tok_support.h (+2 -1)

@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -31,6 +31,7 @@ int Tokenizer_push_textbuffer(Tokenizer*);
 void Tokenizer_delete_top_of_stack(Tokenizer*);
 PyObject* Tokenizer_pop(Tokenizer*);
 PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
+void Tokenizer_memoize_bad_route(Tokenizer*);
 void* Tokenizer_fail_route(Tokenizer*);
 int Tokenizer_check_route(Tokenizer*, uint64_t);
 void Tokenizer_free_bad_route_tree(Tokenizer*);


mwparserfromhell/parser/tokenizer.py (+27 -6)

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -144,6 +144,14 @@ class Tokenizer(object):
         """Return whether or not our max recursion depth has been exceeded."""
         return self._depth < self.MAX_DEPTH
 
+    def _memoize_bad_route(self):
+        """Remember that the current route (head + context at push) is invalid.
+
+        This will be noticed when calling _push with the same head and context,
+        and the route will be failed immediately.
+        """
+        self._bad_routes.add(self._stack_ident)
+
     def _fail_route(self):
         """Fail the current tokenization route.
 
@@ -151,7 +159,7 @@ class Tokenizer(object):
         :exc:`.BadRoute`.
         """
         context = self._context
-        self._bad_routes.add(self._stack_ident)
+        self._memoize_bad_route()
         self._pop()
         raise BadRoute(context)
 
@@ -506,12 +514,16 @@ class Tokenizer(object):
 
     def _parse_external_link(self, brackets):
         """Parse an external link at the head of the wikicode string."""
+        if self._context & contexts.NO_EXT_LINKS or not self._can_recurse():
+            if not brackets and self._context & contexts.DL_TERM:
+                self._handle_dl_term()
+            else:
+                self._emit_text(self._read())
+            return
+
         reset = self._head
         self._head += 1
         try:
-            bad_context = self._context & contexts.NO_EXT_LINKS
-            if bad_context or not self._can_recurse():
-                raise BadRoute()
             link, extra, delta = self._really_parse_external_link(brackets)
         except BadRoute:
             self._head = reset
@@ -719,6 +731,7 @@ class Tokenizer(object):
         elif data.context & data.CX_NOTE_SPACE:
             if data.context & data.CX_QUOTED:
                 data.context = data.CX_ATTR_VALUE
+                self._memoize_bad_route()
                 self._pop()
                 self._head = data.reset - 1  # Will be auto-incremented
                 return  # Break early
@@ -743,7 +756,13 @@ class Tokenizer(object):
             data.context |= data.CX_QUOTED
             data.quoter = chunk
             data.reset = self._head
-            self._push(self._context)
+            try:
+                self._push(self._context)
+            except BadRoute:
+                # Already failed to parse this as a quoted string
+                data.context = data.CX_ATTR_VALUE
+                self._head -= 1
+                return
             continue
         elif data.context & data.CX_QUOTED:
             if chunk == data.quoter and not escaped:
@@ -845,6 +864,7 @@ class Tokenizer(object):
         if data.context & data.CX_QUOTED:
             # Unclosed attribute quote: reset, don't die
             data.context = data.CX_ATTR_VALUE
+            self._memoize_bad_route()
             self._pop()
             self._head = data.reset
             continue
@@ -1084,6 +1104,7 @@ class Tokenizer(object):
         if data.context & data.CX_QUOTED:
             # Unclosed attribute quote: reset, don't die
             data.context = data.CX_ATTR_VALUE
+            self._memoize_bad_route()
            self._pop()
             self._head = data.reset
             continue

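At the API level the behavior is unchanged apart from speed: an unclosed attribute quote still parses ("reset, don't die") rather than raising. A quick check (the exact node breakdown printed is indicative; round-tripping is the guaranteed property):

    import mwparserfromhell

    markup = '<span title="unclosed>text</span>'
    code = mwparserfromhell.parse(markup)

    # The tokenizer backs up and re-reads the quote as ordinary attribute-value
    # text rather than failing the whole tag.
    print(code.filter_tags())

    # str() of the parse tree always reproduces the original markup.
    assert str(code) == markup
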
