From 90bd12dd4790f66704fe6189a6ad827dc16425da Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@gmail.com>
Date: Tue, 22 Sep 2015 22:29:13 -0500
Subject: [PATCH] Fix a C tokenizer crash when parsing is interrupted (fixes
 #97)

---
 CHANGELOG                                        |   1 +
 docs/changelog.rst                               |   1 +
 mwparserfromhell/definitions.py                  |   8 +-
 mwparserfromhell/parser/ctokenizer/definitions.c | 131 +++++++++++++++++++++++
 mwparserfromhell/parser/ctokenizer/definitions.h |  39 +++++++
 mwparserfromhell/parser/ctokenizer/tok_parse.c   |  35 ++----
 mwparserfromhell/parser/ctokenizer/tokenizer.c   |  11 +-
 7 files changed, 192 insertions(+), 34 deletions(-)
 create mode 100644 mwparserfromhell/parser/ctokenizer/definitions.c
 create mode 100644 mwparserfromhell/parser/ctokenizer/definitions.h

diff --git a/CHANGELOG b/CHANGELOG
index 462d2dc..e36a281 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,7 @@
 v0.5 (unreleased):
 
 - Fixed edge cases involving wikilinks inside of external links and vice versa.
+- Fixed a C tokenizer crash when a keyboard interrupt happens while parsing.
 
 v0.4.2 (released July 30, 2015):
 
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 7ca9f29..bd9394a 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -8,6 +8,7 @@ Unreleased
 (`changes <https://github.com/earwig/mwparserfromhell/compare/v0.4.2...develop>`__):
 
 - Fixed edge cases involving wikilinks inside of external links and vice versa.
+- Fixed a C tokenizer crash when a keyboard interrupt happens while parsing.
 
 v0.4.2
 ------
diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py
index cdacb3d..bbfd346 100644
--- a/mwparserfromhell/definitions.py
+++ b/mwparserfromhell/definitions.py
@@ -20,7 +20,13 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-"""Contains data about certain markup, like HTML tags and external links."""
+"""
+Contains data about certain markup, like HTML tags and external links.
+
+When updating this file, please also update the C tokenizer version:
+- mwparserfromhell/parser/ctokenizer/definitions.c
+- mwparserfromhell/parser/ctokenizer/definitions.h
+"""
 
 from __future__ import unicode_literals
 
diff --git a/mwparserfromhell/parser/ctokenizer/definitions.c b/mwparserfromhell/parser/ctokenizer/definitions.c
new file mode 100644
index 0000000..38ed649
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/definitions.c
@@ -0,0 +1,131 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "definitions.h"
+
+/*
+    Keep this file in sync with mwparserfromhell/definitions.py (see it for
+    data sources). Tables below are NULL-terminated lowercase ASCII strings.
+*/
+
+static const char* URI_SCHEMES[] = {
+    "http", "https", "ftp", "ftps", "ssh", "sftp", "irc", "ircs", "xmpp",
+    "sip", "sips", "gopher", "telnet", "nntp", "worldwind", "mailto", "tel",
+    "sms", "news", "svn", "git", "mms", "bitcoin", "magnet", "urn", "geo", NULL
+};
+
+static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
+    "xmpp", "sip", "sips", "mailto", "tel", "sms", "news", "bitcoin", "magnet",
+    "urn", "geo", NULL
+};
+
+static const char* PARSER_BLACKLIST[] = {
+    "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
+    "nowiki", "pre", "score", "section", "source", "syntaxhighlight",
+    "templatedata", "timeline", NULL
+};
+
+static const char* SINGLE[] = {
+    "br", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr", NULL
+};
+
+static const char* SINGLE_ONLY[] = {
+    "br", "hr", "meta", "link", "img", NULL
+};
+
+/*
+    Store the lowercase ASCII form of a PyUnicodeObject in *string; the caller
+    must Py_DECREF the returned bytes object, which owns it. Returns NULL with
+    *string unset on failure; a non-ASCII UnicodeEncodeError is cleared.
+*/
+static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string)
+{
+    PyObject *lower = PyObject_CallMethod(input, "lower", NULL), *bytes;
+    if (!lower)
+        return NULL;
+    bytes = PyUnicode_AsASCIIString(lower);
+    Py_DECREF(lower);
+    if (bytes)
+        *string = PyBytes_AS_STRING(bytes);
+    else if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
+        PyErr_Clear();  /* non-ASCII input is simply "not in any list" */
+    return bytes;
+}
+
+/*
+    Return whether a PyUnicodeObject is in a NULL-terminated list of lowercase
+    ASCII strings. Returns 0 if the input cannot be converted to ASCII.
+*/
+static int unicode_in_string_list(PyObject *input, const char **list)
+{
+    const char *string;
+    PyObject *temp = unicode_to_lcase_ascii(input, &string);
+    int retval = 0;  /* declared before any statement for C89 compilers */
+
+    if (!temp)
+        return 0;
+
+    while (*list) {
+        if (!strcmp(*(list++), string)) {
+            retval = 1;
+            break;
+        }
+    }
+
+    Py_DECREF(temp);
+    return retval;
+}
+
+/*
+    Return 1 if the tag's contents should be parsed (not in PARSER_BLACKLIST).
+*/
+int is_parsable(PyObject *tag)
+{
+    return !unicode_in_string_list(tag, PARSER_BLACKLIST);
+}
+
+/*
+    Return 1 if the given tag can exist without a close tag (is in SINGLE).
+*/
+int is_single(PyObject *tag)
+{
+    return unicode_in_string_list(tag, SINGLE);
+}
+
+/*
+    Return 1 if the given tag must exist without a close tag (SINGLE_ONLY).
+*/
+int is_single_only(PyObject *tag)
+{
+    return unicode_in_string_list(tag, SINGLE_ONLY);
+}
+
+/*
+    Return whether the given scheme is valid for external links. Any scheme in
+    URI_SCHEMES qualifies when slashes is nonzero; otherwise only those in
+    URI_SCHEMES_AUTHORITY_OPTIONAL do.
+*/
+int is_scheme(PyObject *scheme, int slashes)
+{
+    return unicode_in_string_list(
+        scheme, slashes ? URI_SCHEMES : URI_SCHEMES_AUTHORITY_OPTIONAL);
+}
diff --git a/mwparserfromhell/parser/ctokenizer/definitions.h b/mwparserfromhell/parser/ctokenizer/definitions.h
new file mode 100644
index 0000000..8f8dc2c
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/definitions.h
@@ -0,0 +1,39 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#pragma once
+
+#include "common.h"
+
+/* This file should be kept up to date with mwparserfromhell/definitions.py. */
+
+/* Functions: predicates over tag and URI scheme names (each returns 0 or 1) */
+
+int is_parsable(PyObject *tag);
+int is_single(PyObject *tag);
+int is_single_only(PyObject *tag);
+int is_scheme(PyObject *scheme, int slashes);
+
+/* Macros: GET_HTML_TAG maps ':' to "dd", ';' to "dt", all else to "li". */
+
+#define GET_HTML_TAG(markup)                                                  \
+    ((markup) == ':' ? "dd" : (markup) == ';' ? "dt" : "li")
diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c
index 60eef6e..4bb65b4 100644
--- a/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -22,6 +22,7 @@ SOFTWARE.
 
 #include "tok_parse.h"
 #include "contexts.h"
+#include "definitions.h"
 #include "tag_data.h"
 #include "tok_support.h"
 #include "tokens.h"
@@ -33,13 +34,6 @@ SOFTWARE.
 #define MAX_BRACES 255
 #define MAX_ENTITY_SIZE 8
 
-#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li")
-#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL))
-#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL))
-#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL))
-#define IS_SCHEME(scheme, slashes) \
-    (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False))
-
 typedef struct {
     PyObject* title;
     int level;
@@ -82,21 +76,6 @@ static int heading_level_from_context(uint64_t n)
 }
 
 /*
-    Call the given function in definitions.py, using 'in1' and 'in2' as
-    parameters, and return its output as a bool.
-*/
-static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2)
-{
-    PyObject* func = PyObject_GetAttrString(definitions, funcname);
-    PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL);
-    int ans = (result == Py_True) ? 1 : 0;
-
-    Py_DECREF(func);
-    Py_DECREF(result);
-    return ans;
-}
-
-/*
     Sanitize the name of a tag so it can be compared with others for equality.
 */
 static PyObject* strip_tag_name(PyObject* token, int take_attr)
@@ -516,7 +495,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
         Textbuffer_dealloc(buffer);
         if (!scheme)
             return -1;
-        if (!IS_SCHEME(scheme, slashes)) {
+        if (!is_scheme(scheme, slashes)) {
             Py_DECREF(scheme);
             Tokenizer_fail_route(self);
             return 0;
@@ -565,7 +544,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
     }
     slashes = (Tokenizer_read(self, 0) == '/' &&
                Tokenizer_read(self, 1) == '/');
-    if (!IS_SCHEME(scheme, slashes)) {
+    if (!is_scheme(scheme, slashes)) {
         Py_DECREF(scheme);
         Textbuffer_dealloc(scheme_buffer);
         FAIL_ROUTE(0);
@@ -1634,11 +1613,11 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
             text = PyObject_GetAttrString(token, "text");
             if (!text)
                 return NULL;
-            if (IS_SINGLE_ONLY(text)) {
+            if (is_single_only(text)) {
                 Py_DECREF(text);
                 return Tokenizer_handle_single_only_tag_end(self);
             }
-            if (IS_PARSABLE(text)) {
+            if (is_parsable(text)) {
                 Py_DECREF(text);
                 return Tokenizer_parse(self, 0, 0);
             }
@@ -1686,7 +1665,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
                 Textbuffer_dealloc(buf);
                 return -1;
             }
-            if (!IS_SINGLE_ONLY(name))
+            if (!is_single_only(name))
                 FAIL_ROUTE(0);
             Py_DECREF(name);
             break;
@@ -2428,7 +2407,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
             text = PyObject_GetAttrString(token, "text");
             if (!text)
                 return NULL;
-            single = IS_SINGLE(text);
+            single = is_single(text);
             Py_DECREF(text);
             if (single)
                 return Tokenizer_handle_single_tag_end(self);
diff --git a/mwparserfromhell/parser/ctokenizer/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tokenizer.c
index 2b3d321..3d751db 100644
--- a/mwparserfromhell/parser/ctokenizer/tokenizer.c
+++ b/mwparserfromhell/parser/ctokenizer/tokenizer.c
@@ -162,11 +162,12 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     self->skip_style_tags = skip_style_tags;
     tokens = Tokenizer_parse(self, context, 1);
 
-    if ((!tokens && !PyErr_Occurred()) || self->topstack) {
-        if (!ParserError) {
-            if (load_exceptions())
-                return NULL;
-        }
+    if (!tokens || self->topstack) {
+        Py_XDECREF(tokens);
+        if (PyErr_Occurred())
+            return NULL;
+        if (!ParserError && load_exceptions() < 0)
+            return NULL;
         if (BAD_ROUTE) {
             RESET_ROUTE();
             PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");