From 90bd12dd4790f66704fe6189a6ad827dc16425da Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 22 Sep 2015 22:29:13 -0500 Subject: [PATCH] Fix a C tokenizer crash when parsing is interrupted (fixes #97) --- CHANGELOG | 1 + docs/changelog.rst | 1 + mwparserfromhell/definitions.py | 8 +- mwparserfromhell/parser/ctokenizer/definitions.c | 131 +++++++++++++++++++++++ mwparserfromhell/parser/ctokenizer/definitions.h | 39 +++++++ mwparserfromhell/parser/ctokenizer/tok_parse.c | 35 ++---- mwparserfromhell/parser/ctokenizer/tokenizer.c | 11 +- 7 files changed, 192 insertions(+), 34 deletions(-) create mode 100644 mwparserfromhell/parser/ctokenizer/definitions.c create mode 100644 mwparserfromhell/parser/ctokenizer/definitions.h diff --git a/CHANGELOG b/CHANGELOG index 462d2dc..e36a281 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ v0.5 (unreleased): - Fixed edge cases involving wikilinks inside of external links and vice versa. +- Fixed a C tokenizer crash when a keyboard interrupt happens while parsing. v0.4.2 (released July 30, 2015): diff --git a/docs/changelog.rst b/docs/changelog.rst index 7ca9f29..bd9394a 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -8,6 +8,7 @@ Unreleased (`changes `__): - Fixed edge cases involving wikilinks inside of external links and vice versa. +- Fixed a C tokenizer crash when a keyboard interrupt happens while parsing. v0.4.2 ------ diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index cdacb3d..bbfd346 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -20,7 +20,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -"""Contains data about certain markup, like HTML tags and external links.""" +""" +Contains data about certain markup, like HTML tags and external links. 
+ +When updating this file, please also update the C tokenizer version: +- mwparserfromhell/parser/ctokenizer/definitions.c +- mwparserfromhell/parser/ctokenizer/definitions.h +""" from __future__ import unicode_literals diff --git a/mwparserfromhell/parser/ctokenizer/definitions.c b/mwparserfromhell/parser/ctokenizer/definitions.c new file mode 100644 index 0000000..38ed649 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/definitions.c @@ -0,0 +1,131 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "definitions.h" + +/* + This file should be kept up to date with mwparserfromhell/definitions.py. + See the Python version for data sources. 
+*/ + +static const char* URI_SCHEMES[] = { + "http", "https", "ftp", "ftps", "ssh", "sftp", "irc", "ircs", "xmpp", + "sip", "sips", "gopher", "telnet", "nntp", "worldwind", "mailto", "tel", + "sms", "news", "svn", "git", "mms", "bitcoin", "magnet", "urn", "geo", NULL +}; + +static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = { + "xmpp", "sip", "sips", "mailto", "tel", "sms", "news", "bitcoin", "magnet", + "urn", "geo", NULL +}; + +static const char* PARSER_BLACKLIST[] = { + "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math", + "nowiki", "pre", "score", "section", "source", "syntaxhighlight", + "templatedata", "timeline", NULL +}; + +static const char* SINGLE[] = { + "br", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr", NULL +}; + +static const char* SINGLE_ONLY[] = { + "br", "hr", "meta", "link", "img", NULL +}; + +/* + Convert a PyUnicodeObject to a lowercase ASCII char* array and store it in + the second argument. The caller must Py_DECREF the return value when finished. + If the return value is NULL, the conversion failed and *string is not set. +*/ +static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string) +{ + PyObject *lower = PyObject_CallMethod(input, "lower", NULL), *bytes; + + if (!lower) + return NULL; + bytes = PyUnicode_AsASCIIString(lower); + Py_DECREF(lower); + if (!bytes) + return NULL; + *string = PyBytes_AS_STRING(bytes); + return bytes; +} + +/* + Return whether a PyUnicodeObject is in a list of lowercase ASCII strings. +*/ +static int unicode_in_string_list(PyObject *input, const char **list) +{ + const char *string; + PyObject *temp = unicode_to_lcase_ascii(input, &string); + + if (!temp) + return 0; + + int retval = 0; + while (*list) { + if (!strcmp(*(list++), string)) { + retval = 1; + goto end; + } + } + + end: + Py_DECREF(temp); + return retval; +} + +/* + Return if the given tag's contents should be passed to the parser. 
+*/ +int is_parsable(PyObject *tag) +{ + return !unicode_in_string_list(tag, PARSER_BLACKLIST); +} + +/* + Return whether or not the given tag can exist without a close tag. +*/ +int is_single(PyObject *tag) +{ + return unicode_in_string_list(tag, SINGLE); +} + +/* + Return whether or not the given tag must exist without a close tag. +*/ +int is_single_only(PyObject *tag) +{ + return unicode_in_string_list(tag, SINGLE_ONLY); +} + +/* + Return whether the given scheme is valid for external links. +*/ +int is_scheme(PyObject *scheme, int slashes) +{ + if (slashes) + return unicode_in_string_list(scheme, URI_SCHEMES); + else + return unicode_in_string_list(scheme, URI_SCHEMES_AUTHORITY_OPTIONAL); +} diff --git a/mwparserfromhell/parser/ctokenizer/definitions.h b/mwparserfromhell/parser/ctokenizer/definitions.h new file mode 100644 index 0000000..8f8dc2c --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/definitions.h @@ -0,0 +1,39 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#pragma once + +#include "common.h" + +/* This file should be kept up to date with mwparserfromhell/definitions.py. */ + +/* Functions */ + +int is_parsable(PyObject*); +int is_single(PyObject*); +int is_single_only(PyObject*); +int is_scheme(PyObject*, int); + +/* Macros */ + +#define GET_HTML_TAG(markup) \ + (markup == ':' ? "dd" : markup == ';' ? "dt" : "li") diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index 60eef6e..4bb65b4 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -22,6 +22,7 @@ SOFTWARE. #include "tok_parse.h" #include "contexts.h" +#include "definitions.h" #include "tag_data.h" #include "tok_support.h" #include "tokens.h" @@ -33,13 +34,6 @@ SOFTWARE. #define MAX_BRACES 255 #define MAX_ENTITY_SIZE 8 -#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li") -#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL)) -#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL)) -#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL)) -#define IS_SCHEME(scheme, slashes) \ - (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False)) - typedef struct { PyObject* title; int level; @@ -82,21 +76,6 @@ static int heading_level_from_context(uint64_t n) } /* - Call the given function in definitions.py, using 'in1' and 'in2' as - parameters, and return its output as a bool. 
-*/ -static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2) -{ - PyObject* func = PyObject_GetAttrString(definitions, funcname); - PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL); - int ans = (result == Py_True) ? 1 : 0; - - Py_DECREF(func); - Py_DECREF(result); - return ans; -} - -/* Sanitize the name of a tag so it can be compared with others for equality. */ static PyObject* strip_tag_name(PyObject* token, int take_attr) @@ -516,7 +495,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) Textbuffer_dealloc(buffer); if (!scheme) return -1; - if (!IS_SCHEME(scheme, slashes)) { + if (!is_scheme(scheme, slashes)) { Py_DECREF(scheme); Tokenizer_fail_route(self); return 0; @@ -565,7 +544,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) } slashes = (Tokenizer_read(self, 0) == '/' && Tokenizer_read(self, 1) == '/'); - if (!IS_SCHEME(scheme, slashes)) { + if (!is_scheme(scheme, slashes)) { Py_DECREF(scheme); Textbuffer_dealloc(scheme_buffer); FAIL_ROUTE(0); @@ -1634,11 +1613,11 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) text = PyObject_GetAttrString(token, "text"); if (!text) return NULL; - if (IS_SINGLE_ONLY(text)) { + if (is_single_only(text)) { Py_DECREF(text); return Tokenizer_handle_single_only_tag_end(self); } - if (IS_PARSABLE(text)) { + if (is_parsable(text)) { Py_DECREF(text); return Tokenizer_parse(self, 0, 0); } @@ -1686,7 +1665,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) Textbuffer_dealloc(buf); return -1; } - if (!IS_SINGLE_ONLY(name)) + if (!is_single_only(name)) FAIL_ROUTE(0); Py_DECREF(name); break; @@ -2428,7 +2407,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) text = PyObject_GetAttrString(token, "text"); if (!text) return NULL; - single = IS_SINGLE(text); + single = is_single(text); Py_DECREF(text); if (single) return Tokenizer_handle_single_tag_end(self); diff --git 
a/mwparserfromhell/parser/ctokenizer/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tokenizer.c index 2b3d321..3d751db 100644 --- a/mwparserfromhell/parser/ctokenizer/tokenizer.c +++ b/mwparserfromhell/parser/ctokenizer/tokenizer.c @@ -162,11 +162,12 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) self->skip_style_tags = skip_style_tags; tokens = Tokenizer_parse(self, context, 1); - if ((!tokens && !PyErr_Occurred()) || self->topstack) { - if (!ParserError) { - if (load_exceptions()) - return NULL; - } + if (!tokens || self->topstack) { + Py_XDECREF(tokens); + if (PyErr_Occurred()) + return NULL; + if (!ParserError && load_exceptions() < 0) + return NULL; if (BAD_ROUTE) { RESET_ROUTE(); PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");