@@ -1,7 +1,9 @@ | |||
v0.6 (unreleased): | |||
- Improved behavior when adding parameters to templates (via Template.add()) | |||
with poorly formatted whitespace conventions. | |||
with poorly formatted whitespace conventions. (#185) | |||
- Fixed the parser getting stuck in deeply nested HTML tags with unclosed, | |||
quoted attributes. (#190) | |||
v0.5 (released June 23, 2017): | |||
@@ -1,4 +1,4 @@ | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
@@ -9,6 +9,10 @@ Unreleased | |||
- Improved behavior when adding parameters to templates (via | |||
:meth:`.Template.add`) with poorly formatted whitespace conventions. | |||
(`#185 <https://github.com/earwig/mwparserfromhell/issues/185>`_) | |||
- Fixed the parser getting stuck in deeply nested HTML tags with unclosed, | |||
quoted attributes. | |||
(`#190 <https://github.com/earwig/mwparserfromhell/issues/190>`_) | |||
v0.5 | |||
---- | |||
@@ -42,7 +42,7 @@ master_doc = 'index' | |||
# General information about the project. | |||
project = u'mwparserfromhell' | |||
copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic' | |||
copyright = u'2012–2018 Ben Kurtovic' | |||
# The version info for the project you're documenting, acts as replacement for | |||
# |version| and |release|, also used in various other places throughout the | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -27,7 +27,7 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode. | |||
""" | |||
__author__ = "Ben Kurtovic" | |||
__copyright__ = "Copyright (C) 2012-2017 Ben Kurtovic" | |||
__copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic" | |||
__license__ = "MIT License" | |||
__version__ = "0.6.dev0" | |||
__email__ = "ben.kurtovic@gmail.com" | |||
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -722,7 +722,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) | |||
*/ | |||
static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) | |||
{ | |||
#define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS | |||
#define NOT_A_LINK \ | |||
if (!brackets && self->topstack->context & LC_DLTERM) \ | |||
return Tokenizer_handle_dl_term(self); \ | |||
@@ -732,7 +731,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) | |||
PyObject *link, *kwargs; | |||
Textbuffer *extra; | |||
if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { | |||
if (self->topstack->context & AGG_NO_EXT_LINKS || | |||
!(Tokenizer_CAN_RECURSE(self))) { | |||
NOT_A_LINK; | |||
} | |||
extra = Textbuffer_new(&self->text); | |||
@@ -1280,6 +1280,7 @@ static int Tokenizer_handle_tag_data( | |||
else if (data->context & TAG_NOTE_SPACE) { | |||
if (data->context & TAG_QUOTED) { | |||
data->context = TAG_ATTR_VALUE; | |||
Tokenizer_memoize_bad_route(self); | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
self->head = data->reset - 1; // Will be auto-incremented | |||
@@ -1317,7 +1318,12 @@ static int Tokenizer_handle_tag_data( | |||
data->context |= TAG_QUOTED; | |||
data->quoter = chunk; | |||
data->reset = self->head; | |||
if (Tokenizer_push(self, self->topstack->context)) | |||
if (Tokenizer_check_route(self, self->topstack->context) < 0) { | |||
RESET_ROUTE(); | |||
data->context = TAG_ATTR_VALUE; | |||
self->head--; | |||
} | |||
else if (Tokenizer_push(self, self->topstack->context)) | |||
return -1; | |||
return 0; | |||
} | |||
@@ -1613,6 +1619,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) | |||
if (data->context & TAG_QUOTED) { | |||
// Unclosed attribute quote: reset, don't die | |||
data->context = TAG_ATTR_VALUE; | |||
Tokenizer_memoize_bad_route(self); | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
self->head = data->reset; | |||
@@ -2185,6 +2192,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token | |||
if (data->context & TAG_QUOTED) { | |||
// Unclosed attribute quote: reset, don't die | |||
data->context = TAG_ATTR_VALUE; | |||
Tokenizer_memoize_bad_route(self); | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
self->head = data->reset; | |||
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -147,6 +147,22 @@ static int compare_nodes( | |||
} | |||
/* | |||
Remember that the current route (head + context at push) is invalid. | |||
This will be noticed when calling Tokenizer_check_route with the same head | |||
and context, and the route will be failed immediately. | |||
*/ | |||
void Tokenizer_memoize_bad_route(Tokenizer *self) | |||
{ | |||
route_tree_node *node = malloc(sizeof(route_tree_node)); | |||
if (node) { | |||
node->id = self->topstack->ident; | |||
if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) | |||
free(node); | |||
} | |||
} | |||
/* | |||
Fail the current tokenization route. Discards the current | |||
stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the | |||
ident of the failed stack so future parsing attempts down this route can be | |||
@@ -157,13 +173,7 @@ void* Tokenizer_fail_route(Tokenizer* self) | |||
uint64_t context = self->topstack->context; | |||
PyObject* stack; | |||
route_tree_node *node = malloc(sizeof(route_tree_node)); | |||
if (node) { | |||
node->id = self->topstack->ident; | |||
if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) | |||
free(node); | |||
} | |||
Tokenizer_memoize_bad_route(self); | |||
stack = Tokenizer_pop(self); | |||
Py_XDECREF(stack); | |||
FAIL_ROUTE(context); | |||
@@ -173,7 +183,7 @@ void* Tokenizer_fail_route(Tokenizer* self) | |||
/* | |||
Check if pushing a new route here with the given context would definitely | |||
fail, based on a previous call to Tokenizer_fail_route() with the same | |||
stack. | |||
stack. (Or any other call to Tokenizer_memoize_bad_route().) | |||
Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the | |||
latter case. | |||
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -31,6 +31,7 @@ int Tokenizer_push_textbuffer(Tokenizer*); | |||
void Tokenizer_delete_top_of_stack(Tokenizer*); | |||
PyObject* Tokenizer_pop(Tokenizer*); | |||
PyObject* Tokenizer_pop_keeping_context(Tokenizer*); | |||
void Tokenizer_memoize_bad_route(Tokenizer*); | |||
void* Tokenizer_fail_route(Tokenizer*); | |||
int Tokenizer_check_route(Tokenizer*, uint64_t); | |||
void Tokenizer_free_bad_route_tree(Tokenizer*); | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -144,6 +144,14 @@ class Tokenizer(object): | |||
"""Return whether or not our max recursion depth has been exceeded.""" | |||
return self._depth < self.MAX_DEPTH | |||
def _memoize_bad_route(self): | |||
"""Remember that the current route (head + context at push) is invalid. | |||
This will be noticed when calling _push with the same head and context, | |||
and the route will be failed immediately. | |||
""" | |||
self._bad_routes.add(self._stack_ident) | |||
def _fail_route(self): | |||
"""Fail the current tokenization route. | |||
@@ -151,7 +159,7 @@ class Tokenizer(object): | |||
:exc:`.BadRoute`. | |||
""" | |||
context = self._context | |||
self._bad_routes.add(self._stack_ident) | |||
self._memoize_bad_route() | |||
self._pop() | |||
raise BadRoute(context) | |||
@@ -506,12 +514,16 @@ class Tokenizer(object): | |||
def _parse_external_link(self, brackets): | |||
"""Parse an external link at the head of the wikicode string.""" | |||
if self._context & contexts.NO_EXT_LINKS or not self._can_recurse(): | |||
if not brackets and self._context & contexts.DL_TERM: | |||
self._handle_dl_term() | |||
else: | |||
self._emit_text(self._read()) | |||
return | |||
reset = self._head | |||
self._head += 1 | |||
try: | |||
bad_context = self._context & contexts.NO_EXT_LINKS | |||
if bad_context or not self._can_recurse(): | |||
raise BadRoute() | |||
link, extra, delta = self._really_parse_external_link(brackets) | |||
except BadRoute: | |||
self._head = reset | |||
@@ -719,6 +731,7 @@ class Tokenizer(object): | |||
elif data.context & data.CX_NOTE_SPACE: | |||
if data.context & data.CX_QUOTED: | |||
data.context = data.CX_ATTR_VALUE | |||
self._memoize_bad_route() | |||
self._pop() | |||
self._head = data.reset - 1 # Will be auto-incremented | |||
return # Break early | |||
@@ -743,7 +756,13 @@ class Tokenizer(object): | |||
data.context |= data.CX_QUOTED | |||
data.quoter = chunk | |||
data.reset = self._head | |||
self._push(self._context) | |||
try: | |||
self._push(self._context) | |||
except BadRoute: | |||
# Already failed to parse this as a quoted string | |||
data.context = data.CX_ATTR_VALUE | |||
self._head -= 1 | |||
return | |||
continue | |||
elif data.context & data.CX_QUOTED: | |||
if chunk == data.quoter and not escaped: | |||
@@ -845,6 +864,7 @@ class Tokenizer(object): | |||
if data.context & data.CX_QUOTED: | |||
# Unclosed attribute quote: reset, don't die | |||
data.context = data.CX_ATTR_VALUE | |||
self._memoize_bad_route() | |||
self._pop() | |||
self._head = data.reset | |||
continue | |||
@@ -1084,6 +1104,7 @@ class Tokenizer(object): | |||
if data.context & data.CX_QUOTED: | |||
# Unclosed attribute quote: reset, don't die | |||
data.context = data.CX_ATTR_VALUE | |||
self._memoize_bad_route() | |||
self._pop() | |||
self._head = data.reset | |||
continue | |||