@@ -1,7 +1,9 @@
 v0.6 (unreleased):
 
 - Improved behavior when adding parameters to templates (via Template.add())
-  with poorly formatted whitespace conventions.
+  with poorly formatted whitespace conventions. (#185)
+- Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
+  quoted attributes. (#190)
 
 v0.5 (released June 23, 2017):
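
For a concrete picture of the #190 entry above, here is a hypothetical reproduction sketch (the input shape is an assumption, not the project's actual regression test): each `<span id='` opens a tag whose quoted attribute value never closes, nesting the next tag inside it, which previously sent the tokenizer re-exploring the same failed routes at every nesting level.

    # Hypothetical reproduction of #190; "<span id='" is illustrative --
    # any nested tag with an unclosed, quoted attribute has the same shape.
    # Before the fix, parse time grew roughly exponentially with depth;
    # with bad-route memoization it completes quickly, degrading to text.
    import mwparserfromhell

    text = "<span id='" * 30
    print(mwparserfromhell.parse(text))
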
@@ -1,4 +1,4 @@
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -9,6 +9,10 @@ Unreleased
 - Improved behavior when adding parameters to templates (via
   :meth:`.Template.add`) with poorly formatted whitespace conventions.
+  (`#185 <https://github.com/earwig/mwparserfromhell/issues/185>`_)
+- Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
+  quoted attributes.
+  (`#190 <https://github.com/earwig/mwparserfromhell/issues/190>`_)
 
 v0.5
 ----
 
@@ -42,7 +42,7 @@ master_doc = 'index'
 
 # General information about the project.
 project = u'mwparserfromhell'
-copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic'
+copyright = u'2012–2018 Ben Kurtovic'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode.
 """
 
 __author__ = "Ben Kurtovic"
-__copyright__ = "Copyright (C) 2012-2017 Ben Kurtovic"
+__copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic"
 __license__ = "MIT License"
 __version__ = "0.6.dev0"
 __email__ = "ben.kurtovic@gmail.com"
@@ -1,5 +1,5 @@ | |||||
/* | /* | ||||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | Permission is hereby granted, free of charge, to any person obtaining a copy of | ||||
this software and associated documentation files (the "Software"), to deal in | this software and associated documentation files (the "Software"), to deal in | ||||
@@ -722,7 +722,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
 */
 static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
 {
-#define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS
 #define NOT_A_LINK \
     if (!brackets && self->topstack->context & LC_DLTERM) \
         return Tokenizer_handle_dl_term(self); \
@@ -732,7 +731,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
     PyObject *link, *kwargs;
     Textbuffer *extra;
 
-    if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
+    if (self->topstack->context & AGG_NO_EXT_LINKS ||
+        !(Tokenizer_CAN_RECURSE(self))) {
         NOT_A_LINK;
     }
     extra = Textbuffer_new(&self->text);
@@ -1280,6 +1280,7 @@ static int Tokenizer_handle_tag_data(
     else if (data->context & TAG_NOTE_SPACE) {
         if (data->context & TAG_QUOTED) {
             data->context = TAG_ATTR_VALUE;
+            Tokenizer_memoize_bad_route(self);
             trash = Tokenizer_pop(self);
             Py_XDECREF(trash);
             self->head = data->reset - 1;  // Will be auto-incremented
@@ -1317,7 +1318,12 @@ static int Tokenizer_handle_tag_data(
             data->context |= TAG_QUOTED;
             data->quoter = chunk;
             data->reset = self->head;
-            if (Tokenizer_push(self, self->topstack->context))
+            if (Tokenizer_check_route(self, self->topstack->context) < 0) {
+                RESET_ROUTE();
+                data->context = TAG_ATTR_VALUE;
+                self->head--;
+            }
+            else if (Tokenizer_push(self, self->topstack->context))
                 return -1;
             return 0;
         }
@@ -1613,6 +1619,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
         if (data->context & TAG_QUOTED) {
             // Unclosed attribute quote: reset, don't die
             data->context = TAG_ATTR_VALUE;
+            Tokenizer_memoize_bad_route(self);
             trash = Tokenizer_pop(self);
             Py_XDECREF(trash);
             self->head = data->reset;
@@ -2185,6 +2192,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token
         if (data->context & TAG_QUOTED) {
             // Unclosed attribute quote: reset, don't die
             data->context = TAG_ATTR_VALUE;
+            Tokenizer_memoize_bad_route(self);
             trash = Tokenizer_pop(self);
             Py_XDECREF(trash);
             self->head = data->reset;
@@ -1,5 +1,5 @@ | |||||
/* | /* | ||||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | Permission is hereby granted, free of charge, to any person obtaining a copy of | ||||
this software and associated documentation files (the "Software"), to deal in | this software and associated documentation files (the "Software"), to deal in | ||||
@@ -147,6 +147,22 @@ static int compare_nodes(
 }
 
 /*
+    Remember that the current route (head + context at push) is invalid.
+
+    This will be noticed when calling Tokenizer_check_route with the same head
+    and context, and the route will be failed immediately.
+*/
+void Tokenizer_memoize_bad_route(Tokenizer *self)
+{
+    route_tree_node *node = malloc(sizeof(route_tree_node));
+    if (node) {
+        node->id = self->topstack->ident;
+        if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
+            free(node);
+    }
+}
+
+/*
     Fail the current tokenization route. Discards the current
     stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the
     ident of the failed stack so future parsing attempts down this route can be
@@ -157,13 +173,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
     uint64_t context = self->topstack->context;
     PyObject* stack;
 
-    route_tree_node *node = malloc(sizeof(route_tree_node));
-    if (node) {
-        node->id = self->topstack->ident;
-        if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
-            free(node);
-    }
-
+    Tokenizer_memoize_bad_route(self);
     stack = Tokenizer_pop(self);
     Py_XDECREF(stack);
     FAIL_ROUTE(context);
@@ -173,7 +183,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
 /*
     Check if pushing a new route here with the given context would definitely
     fail, based on a previous call to Tokenizer_fail_route() with the same
-    stack.
+    stack. (Or any other call to Tokenizer_memoize_bad_route().)
 
     Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the
     latter case.
@@ -1,5 +1,5 @@ | |||||
/* | /* | ||||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | Permission is hereby granted, free of charge, to any person obtaining a copy of | ||||
this software and associated documentation files (the "Software"), to deal in | this software and associated documentation files (the "Software"), to deal in | ||||
@@ -31,6 +31,7 @@ int Tokenizer_push_textbuffer(Tokenizer*);
 void Tokenizer_delete_top_of_stack(Tokenizer*);
 PyObject* Tokenizer_pop(Tokenizer*);
 PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
+void Tokenizer_memoize_bad_route(Tokenizer*);
 void* Tokenizer_fail_route(Tokenizer*);
 int Tokenizer_check_route(Tokenizer*, uint64_t);
 void Tokenizer_free_bad_route_tree(Tokenizer*);
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -144,6 +144,14 @@ class Tokenizer(object):
         """Return whether or not our max recursion depth has been exceeded."""
         return self._depth < self.MAX_DEPTH
 
+    def _memoize_bad_route(self):
+        """Remember that the current route (head + context at push) is invalid.
+
+        This will be noticed when calling _push with the same head and context,
+        and the route will be failed immediately.
+        """
+        self._bad_routes.add(self._stack_ident)
+
     def _fail_route(self):
         """Fail the current tokenization route.
 
@@ -151,7 +159,7 @@ class Tokenizer(object):
         :exc:`.BadRoute`.
         """
         context = self._context
-        self._bad_routes.add(self._stack_ident)
+        self._memoize_bad_route()
         self._pop()
         raise BadRoute(context)
 
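
The docstring above says a memoized route "will be noticed when calling _push with the same head and context". Reduced to a standalone sketch (illustrative names, not the library's internals), the mechanism these hunks rely on looks like this:

    # Minimal sketch of bad-route memoization, assuming a tokenizer that
    # identifies each speculative parse by (head, context) at push time.
    class BadRoute(Exception):
        pass

    class RouteMemo:
        def __init__(self):
            self._bad_routes = set()   # idents of routes known to fail
            self._stack_ident = None   # ident of the route being parsed
            self._head = 0             # position in the input
            self._context = 0          # context flags (a bitfield)

        def _push(self, context):
            ident = (self._head, context)
            if ident in self._bad_routes:
                raise BadRoute(context)  # known-bad: fail instantly
            self._stack_ident = ident
            self._context = context

        def _memoize_bad_route(self):
            self._bad_routes.add(self._stack_ident)

        def _fail_route(self):
            self._memoize_bad_route()    # mirrors _fail_route above
            raise BadRoute(self._context)
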
@@ -506,12 +514,16 @@ class Tokenizer(object):
 
     def _parse_external_link(self, brackets):
         """Parse an external link at the head of the wikicode string."""
+        if self._context & contexts.NO_EXT_LINKS or not self._can_recurse():
+            if not brackets and self._context & contexts.DL_TERM:
+                self._handle_dl_term()
+            else:
+                self._emit_text(self._read())
+            return
+
         reset = self._head
         self._head += 1
         try:
-            bad_context = self._context & contexts.NO_EXT_LINKS
-            if bad_context or not self._can_recurse():
-                raise BadRoute()
             link, extra, delta = self._really_parse_external_link(brackets)
         except BadRoute:
             self._head = reset
@@ -719,6 +731,7 @@ class Tokenizer(object):
         elif data.context & data.CX_NOTE_SPACE:
             if data.context & data.CX_QUOTED:
                 data.context = data.CX_ATTR_VALUE
+                self._memoize_bad_route()
                 self._pop()
                 self._head = data.reset - 1  # Will be auto-incremented
                 return  # Break early
@@ -743,7 +756,13 @@ class Tokenizer(object):
                 data.context |= data.CX_QUOTED
                 data.quoter = chunk
                 data.reset = self._head
-                self._push(self._context)
+                try:
+                    self._push(self._context)
+                except BadRoute:
+                    # Already failed to parse this as a quoted string
+                    data.context = data.CX_ATTR_VALUE
+                    self._head -= 1
+                    return
                 continue
             elif data.context & data.CX_QUOTED:
                 if chunk == data.quoter and not escaped:
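
Combined with the sketch after the _fail_route hunk, the flow this hunk implements looks roughly like the following (head and context values are arbitrary placeholders): the first unclosed quote is discovered the slow way and memoized; reaching the same quote again fails out of _push immediately, and the quote is treated as literal attribute text.

    # Rough flow of the quoted-attribute fix, reusing RouteMemo above.
    memo = RouteMemo()
    memo._head = 7             # position of the opening quote
    memo._push(0x40)           # speculatively parse a quoted value
    memo._memoize_bad_route()  # quote never closed: remember the failure

    memo._head = 7             # same position, reached on a later pass
    try:
        memo._push(0x40)       # already failed as a quoted string...
    except BadRoute:
        pass                   # ...so fall back to literal text at once
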
@@ -845,6 +864,7 @@ class Tokenizer(object):
             if data.context & data.CX_QUOTED:
                 # Unclosed attribute quote: reset, don't die
                 data.context = data.CX_ATTR_VALUE
+                self._memoize_bad_route()
                 self._pop()
                 self._head = data.reset
                 continue
@@ -1084,6 +1104,7 @@ class Tokenizer(object):
             if data.context & data.CX_QUOTED:
                 # Unclosed attribute quote: reset, don't die
                 data.context = data.CX_ATTR_VALUE
+                self._memoize_bad_route()
                 self._pop()
                 self._head = data.reset
                 continue