@@ -1,6 +1,7 @@ | |||||
v0.5 (unreleased): | v0.5 (unreleased): | ||||
- Fixed edge cases involving wikilinks inside of external links and vice versa. | - Fixed edge cases involving wikilinks inside of external links and vice versa. | ||||
- Fixed a C tokenizer crash when a keyboard interrupt happens while parsing. | |||||
v0.4.2 (released July 30, 2015): | v0.4.2 (released July 30, 2015): | ||||
@@ -8,6 +8,7 @@ Unreleased | |||||
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.4.2...develop>`__): | (`changes <https://github.com/earwig/mwparserfromhell/compare/v0.4.2...develop>`__): | ||||
- Fixed edge cases involving wikilinks inside of external links and vice versa. | - Fixed edge cases involving wikilinks inside of external links and vice versa. | ||||
- Fixed a C tokenizer crash when a keyboard interrupt happens while parsing. | |||||
v0.4.2 | v0.4.2 | ||||
------ | ------ | ||||
@@ -20,7 +20,13 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
"""Contains data about certain markup, like HTML tags and external links.""" | |||||
""" | |||||
Contains data about certain markup, like HTML tags and external links. | |||||
When updating this file, please also update the C tokenizer version: | |||||
- mwparserfromhell/parser/ctokenizer/definitions.c | |||||
- mwparserfromhell/parser/ctokenizer/definitions.h | |||||
""" | |||||
from __future__ import unicode_literals | from __future__ import unicode_literals | ||||
@@ -0,0 +1,131 @@ | |||||
/* | |||||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||||
this software and associated documentation files (the "Software"), to deal in | |||||
the Software without restriction, including without limitation the rights to | |||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||||
of the Software, and to permit persons to whom the Software is furnished to do | |||||
so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. | |||||
*/ | |||||
#include "definitions.h" | |||||
/* | |||||
This file should be kept up to date with mwparserfromhell/definitions.py. | |||||
See the Python version for data sources. | |||||
*/ | |||||
/* Schemes accepted for external links when the scheme is followed by "//". */
static const char* URI_SCHEMES[] = {
    "http", "https", "ftp", "ftps", "ssh", "sftp",
    "irc", "ircs", "xmpp", "sip", "sips", "gopher",
    "telnet", "nntp", "worldwind", "mailto", "tel", "sms",
    "news", "svn", "git", "mms", "bitcoin", "magnet",
    "urn", "geo",
    NULL
};

/* Schemes accepted for external links when no "//" follows the scheme. */
static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
    "xmpp", "sip", "sips", "mailto", "tel", "sms",
    "news", "bitcoin", "magnet", "urn", "geo",
    NULL
};

/* Tags whose contents are not passed to the parser. */
static const char* PARSER_BLACKLIST[] = {
    "categorytree", "gallery", "hiero", "imagemap",
    "inputbox", "math", "nowiki", "pre",
    "score", "section", "source", "syntaxhighlight",
    "templatedata", "timeline",
    NULL
};

/* Tags that can exist without a close tag. */
static const char* SINGLE[] = {
    "br", "hr", "meta", "link", "img",
    "li", "dt", "dd", "th", "td", "tr",
    NULL
};

/* Tags that must exist without a close tag. */
static const char* SINGLE_ONLY[] = {
    "br", "hr", "meta", "link", "img",
    NULL
};
/* | |||||
Convert a PyUnicodeObject to a lowercase ASCII char* array and store it in | |||||
the second argument. The caller must free the return value when finished. | |||||
If the return value is NULL, the conversion failed and *string is not set. | |||||
*/ | |||||
static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string) | |||||
{ | |||||
PyObject *lower = PyObject_CallMethod(input, "lower", NULL), *bytes; | |||||
if (!lower) | |||||
return NULL; | |||||
bytes = PyUnicode_AsASCIIString(lower); | |||||
Py_DECREF(lower); | |||||
if (!bytes) | |||||
return NULL; | |||||
*string = PyBytes_AS_STRING(bytes); | |||||
return bytes; | |||||
} | |||||
/* | |||||
Return whether a PyUnicodeObject is in a list of lowercase ASCII strings. | |||||
*/ | |||||
static int unicode_in_string_list(PyObject *input, const char **list) | |||||
{ | |||||
const char *string; | |||||
PyObject *temp = unicode_to_lcase_ascii(input, &string); | |||||
if (!temp) | |||||
return 0; | |||||
int retval = 0; | |||||
while (*list) { | |||||
if (!strcmp(*(list++), string)) { | |||||
retval = 1; | |||||
goto end; | |||||
} | |||||
} | |||||
end: | |||||
Py_DECREF(temp); | |||||
return retval; | |||||
} | |||||
/* | |||||
Return if the given tag's contents should be passed to the parser. | |||||
*/ | |||||
int is_parsable(PyObject *tag) | |||||
{ | |||||
return !unicode_in_string_list(tag, PARSER_BLACKLIST); | |||||
} | |||||
/* | |||||
Return whether or not the given tag can exist without a close tag. | |||||
*/ | |||||
int is_single(PyObject *tag) | |||||
{ | |||||
return unicode_in_string_list(tag, SINGLE); | |||||
} | |||||
/* | |||||
Return whether or not the given tag must exist without a close tag. | |||||
*/ | |||||
int is_single_only(PyObject *tag) | |||||
{ | |||||
return unicode_in_string_list(tag, SINGLE_ONLY); | |||||
} | |||||
/* | |||||
Return whether the given scheme is valid for external links. | |||||
*/ | |||||
int is_scheme(PyObject *scheme, int slashes) | |||||
{ | |||||
if (slashes) | |||||
return unicode_in_string_list(scheme, URI_SCHEMES); | |||||
else | |||||
return unicode_in_string_list(scheme, URI_SCHEMES_AUTHORITY_OPTIONAL); | |||||
} |
@@ -0,0 +1,39 @@ | |||||
/* | |||||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||||
this software and associated documentation files (the "Software"), to deal in | |||||
the Software without restriction, including without limitation the rights to | |||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||||
of the Software, and to permit persons to whom the Software is furnished to do | |||||
so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. | |||||
*/ | |||||
#pragma once | |||||
#include "common.h" | |||||
/* This file should be kept up to date with mwparserfromhell/definitions.py. */ | |||||
/* Functions */ | |||||
/* Nonzero if the tag's contents should be passed to the parser. */
int is_parsable(PyObject *tag);

/* Nonzero if the tag can exist without a close tag. */
int is_single(PyObject *tag);

/* Nonzero if the tag must exist without a close tag. */
int is_single_only(PyObject *tag);

/* Nonzero if the scheme is valid for external links. */
int is_scheme(PyObject *scheme, int slashes);
/* Macros */ | |||||
/*
    Map a list-markup character to an HTML tag name: ':' gives "dd", ';' gives
    "dt", and anything else gives "li".

    The argument is parenthesized so that expressions with operators of lower
    precedence than '==' (such as 'c & 0xFF') compare as a whole. It may be
    evaluated twice, so do not pass an expression with side effects.
*/
#define GET_HTML_TAG(markup)                                                  \
    ((markup) == ':' ? "dd" : (markup) == ';' ? "dt" : "li")
@@ -22,6 +22,7 @@ SOFTWARE. | |||||
#include "tok_parse.h" | #include "tok_parse.h" | ||||
#include "contexts.h" | #include "contexts.h" | ||||
#include "definitions.h" | |||||
#include "tag_data.h" | #include "tag_data.h" | ||||
#include "tok_support.h" | #include "tok_support.h" | ||||
#include "tokens.h" | #include "tokens.h" | ||||
@@ -33,13 +34,6 @@ SOFTWARE. | |||||
#define MAX_BRACES 255 | #define MAX_BRACES 255 | ||||
#define MAX_ENTITY_SIZE 8 | #define MAX_ENTITY_SIZE 8 | ||||
#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li") | |||||
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL)) | |||||
#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL)) | |||||
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL)) | |||||
#define IS_SCHEME(scheme, slashes) \ | |||||
(call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False)) | |||||
typedef struct { | typedef struct { | ||||
PyObject* title; | PyObject* title; | ||||
int level; | int level; | ||||
@@ -82,21 +76,6 @@ static int heading_level_from_context(uint64_t n) | |||||
} | } | ||||
/* | /* | ||||
Call the given function in definitions.py, using 'in1' and 'in2' as | |||||
parameters, and return its output as a bool. | |||||
*/ | |||||
static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2) | |||||
{ | |||||
PyObject* func = PyObject_GetAttrString(definitions, funcname); | |||||
PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL); | |||||
int ans = (result == Py_True) ? 1 : 0; | |||||
Py_DECREF(func); | |||||
Py_DECREF(result); | |||||
return ans; | |||||
} | |||||
/* | |||||
Sanitize the name of a tag so it can be compared with others for equality. | Sanitize the name of a tag so it can be compared with others for equality. | ||||
*/ | */ | ||||
static PyObject* strip_tag_name(PyObject* token, int take_attr) | static PyObject* strip_tag_name(PyObject* token, int take_attr) | ||||
@@ -516,7 +495,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | |||||
Textbuffer_dealloc(buffer); | Textbuffer_dealloc(buffer); | ||||
if (!scheme) | if (!scheme) | ||||
return -1; | return -1; | ||||
if (!IS_SCHEME(scheme, slashes)) { | |||||
if (!is_scheme(scheme, slashes)) { | |||||
Py_DECREF(scheme); | Py_DECREF(scheme); | ||||
Tokenizer_fail_route(self); | Tokenizer_fail_route(self); | ||||
return 0; | return 0; | ||||
@@ -565,7 +544,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||||
} | } | ||||
slashes = (Tokenizer_read(self, 0) == '/' && | slashes = (Tokenizer_read(self, 0) == '/' && | ||||
Tokenizer_read(self, 1) == '/'); | Tokenizer_read(self, 1) == '/'); | ||||
if (!IS_SCHEME(scheme, slashes)) { | |||||
if (!is_scheme(scheme, slashes)) { | |||||
Py_DECREF(scheme); | Py_DECREF(scheme); | ||||
Textbuffer_dealloc(scheme_buffer); | Textbuffer_dealloc(scheme_buffer); | ||||
FAIL_ROUTE(0); | FAIL_ROUTE(0); | ||||
@@ -1634,11 +1613,11 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) | |||||
text = PyObject_GetAttrString(token, "text"); | text = PyObject_GetAttrString(token, "text"); | ||||
if (!text) | if (!text) | ||||
return NULL; | return NULL; | ||||
if (IS_SINGLE_ONLY(text)) { | |||||
if (is_single_only(text)) { | |||||
Py_DECREF(text); | Py_DECREF(text); | ||||
return Tokenizer_handle_single_only_tag_end(self); | return Tokenizer_handle_single_only_tag_end(self); | ||||
} | } | ||||
if (IS_PARSABLE(text)) { | |||||
if (is_parsable(text)) { | |||||
Py_DECREF(text); | Py_DECREF(text); | ||||
return Tokenizer_parse(self, 0, 0); | return Tokenizer_parse(self, 0, 0); | ||||
} | } | ||||
@@ -1686,7 +1665,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) | |||||
Textbuffer_dealloc(buf); | Textbuffer_dealloc(buf); | ||||
return -1; | return -1; | ||||
} | } | ||||
if (!IS_SINGLE_ONLY(name)) | |||||
if (!is_single_only(name)) | |||||
FAIL_ROUTE(0); | FAIL_ROUTE(0); | ||||
Py_DECREF(name); | Py_DECREF(name); | ||||
break; | break; | ||||
@@ -2428,7 +2407,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) | |||||
text = PyObject_GetAttrString(token, "text"); | text = PyObject_GetAttrString(token, "text"); | ||||
if (!text) | if (!text) | ||||
return NULL; | return NULL; | ||||
single = IS_SINGLE(text); | |||||
single = is_single(text); | |||||
Py_DECREF(text); | Py_DECREF(text); | ||||
if (single) | if (single) | ||||
return Tokenizer_handle_single_tag_end(self); | return Tokenizer_handle_single_tag_end(self); | ||||
@@ -162,11 +162,12 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||||
self->skip_style_tags = skip_style_tags; | self->skip_style_tags = skip_style_tags; | ||||
tokens = Tokenizer_parse(self, context, 1); | tokens = Tokenizer_parse(self, context, 1); | ||||
if ((!tokens && !PyErr_Occurred()) || self->topstack) { | |||||
if (!ParserError) { | |||||
if (load_exceptions()) | |||||
return NULL; | |||||
} | |||||
if (!tokens || self->topstack) { | |||||
Py_XDECREF(tokens); | |||||
if (PyErr_Occurred()) | |||||
return NULL; | |||||
if (!ParserError && load_exceptions() < 0) | |||||
return NULL; | |||||
if (BAD_ROUTE) { | if (BAD_ROUTE) { | ||||
RESET_ROUTE(); | RESET_ROUTE(); | ||||
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); | PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); | ||||