@@ -1,6 +1,7 @@ | |||
v0.5 (unreleased): | |||
- Fixed edge cases involving wikilinks inside of external links and vice versa. | |||
- Fixed a C tokenizer crash when a keyboard interrupt happens while parsing. | |||
v0.4.2 (released July 30, 2015): | |||
@@ -8,6 +8,7 @@ Unreleased | |||
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.4.2...develop>`__): | |||
- Fixed edge cases involving wikilinks inside of external links and vice versa. | |||
- Fixed a C tokenizer crash when a keyboard interrupt happens while parsing. | |||
v0.4.2 | |||
------ | |||
@@ -20,7 +20,13 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
"""Contains data about certain markup, like HTML tags and external links.""" | |||
""" | |||
Contains data about certain markup, like HTML tags and external links. | |||
When updating this file, please also update the C tokenizer version: | |||
- mwparserfromhell/parser/ctokenizer/definitions.c | |||
- mwparserfromhell/parser/ctokenizer/definitions.h | |||
""" | |||
from __future__ import unicode_literals | |||
@@ -0,0 +1,131 @@ | |||
/* | |||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
the Software without restriction, including without limitation the rights to | |||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||
of the Software, and to permit persons to whom the Software is furnished to do | |||
so, subject to the following conditions: | |||
The above copyright notice and this permission notice shall be included in all | |||
copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
SOFTWARE. | |||
*/ | |||
#include "definitions.h" | |||
/* | |||
This file should be kept up to date with mwparserfromhell/definitions.py. | |||
See the Python version for data sources. | |||
*/ | |||
/*
    Lookup tables of lowercase ASCII names. Each table ends with a NULL
    sentinel so that it can be scanned without a separate length.
*/

/* Schemes checked by is_scheme() when its 'slashes' argument is nonzero. */
static const char *URI_SCHEMES[] = {
    "http", "https", "ftp", "ftps", "ssh", "sftp", "irc", "ircs",
    "xmpp", "sip", "sips", "gopher", "telnet", "nntp", "worldwind",
    "mailto", "tel", "sms", "news", "svn", "git", "mms", "bitcoin",
    "magnet", "urn", "geo",
    NULL
};

/* Schemes checked by is_scheme() when its 'slashes' argument is zero. */
static const char *URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
    "xmpp", "sip", "sips", "mailto", "tel", "sms", "news", "bitcoin",
    "magnet", "urn", "geo",
    NULL
};

/* Tags that is_parsable() reports as not parsable. */
static const char *PARSER_BLACKLIST[] = {
    "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
    "nowiki", "pre", "score", "section", "source", "syntaxhighlight",
    "templatedata", "timeline",
    NULL
};

/* Tags that is_single() accepts. */
static const char *SINGLE[] = {
    "br", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr",
    NULL
};

/* Tags that is_single_only() accepts. */
static const char *SINGLE_ONLY[] = {
    "br", "hr", "meta", "link", "img",
    NULL
};
/* | |||
Convert a PyUnicodeObject to a lowercase ASCII char* array and store it in | |||
the second argument. The caller must free the return value when finished. | |||
If the return value is NULL, the conversion failed and *string is not set. | |||
*/ | |||
static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string) | |||
{ | |||
PyObject *lower = PyObject_CallMethod(input, "lower", NULL), *bytes; | |||
if (!lower) | |||
return NULL; | |||
bytes = PyUnicode_AsASCIIString(lower); | |||
Py_DECREF(lower); | |||
if (!bytes) | |||
return NULL; | |||
*string = PyBytes_AS_STRING(bytes); | |||
return bytes; | |||
} | |||
/* | |||
Return whether a PyUnicodeObject is in a list of lowercase ASCII strings. | |||
*/ | |||
static int unicode_in_string_list(PyObject *input, const char **list) | |||
{ | |||
const char *string; | |||
PyObject *temp = unicode_to_lcase_ascii(input, &string); | |||
if (!temp) | |||
return 0; | |||
int retval = 0; | |||
while (*list) { | |||
if (!strcmp(*(list++), string)) { | |||
retval = 1; | |||
goto end; | |||
} | |||
} | |||
end: | |||
Py_DECREF(temp); | |||
return retval; | |||
} | |||
/* | |||
Return if the given tag's contents should be passed to the parser. | |||
*/ | |||
int is_parsable(PyObject *tag) | |||
{ | |||
return !unicode_in_string_list(tag, PARSER_BLACKLIST); | |||
} | |||
/* | |||
Return whether or not the given tag can exist without a close tag. | |||
*/ | |||
int is_single(PyObject *tag) | |||
{ | |||
return unicode_in_string_list(tag, SINGLE); | |||
} | |||
/* | |||
Return whether or not the given tag must exist without a close tag. | |||
*/ | |||
int is_single_only(PyObject *tag) | |||
{ | |||
return unicode_in_string_list(tag, SINGLE_ONLY); | |||
} | |||
/* | |||
Return whether the given scheme is valid for external links. | |||
*/ | |||
int is_scheme(PyObject *scheme, int slashes) | |||
{ | |||
if (slashes) | |||
return unicode_in_string_list(scheme, URI_SCHEMES); | |||
else | |||
return unicode_in_string_list(scheme, URI_SCHEMES_AUTHORITY_OPTIONAL); | |||
} |
@@ -0,0 +1,39 @@ | |||
/* | |||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
the Software without restriction, including without limitation the rights to | |||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||
of the Software, and to permit persons to whom the Software is furnished to do | |||
so, subject to the following conditions: | |||
The above copyright notice and this permission notice shall be included in all | |||
copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
SOFTWARE. | |||
*/ | |||
#pragma once | |||
#include "common.h" | |||
/* This file should be kept up to date with mwparserfromhell/definitions.py. */ | |||
/* Functions */ | |||
int is_parsable(PyObject*); | |||
int is_single(PyObject*); | |||
int is_single_only(PyObject*); | |||
int is_scheme(PyObject*, int); | |||
/* Macros */

/*
    Map a markup character to an HTML tag name: ':' gives "dd", ';' gives
    "dt", and any other value gives "li".

    The argument is parenthesized so that compound expressions expand with the
    intended precedence. It may be evaluated twice, so it must not have side
    effects.
*/
#define GET_HTML_TAG(markup) \
    ((markup) == ':' ? "dd" : (markup) == ';' ? "dt" : "li")
@@ -22,6 +22,7 @@ SOFTWARE. | |||
#include "tok_parse.h" | |||
#include "contexts.h" | |||
#include "definitions.h" | |||
#include "tag_data.h" | |||
#include "tok_support.h" | |||
#include "tokens.h" | |||
@@ -33,13 +34,6 @@ SOFTWARE. | |||
#define MAX_BRACES 255 | |||
#define MAX_ENTITY_SIZE 8 | |||
#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li") | |||
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL)) | |||
#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL)) | |||
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL)) | |||
#define IS_SCHEME(scheme, slashes) \ | |||
(call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False)) | |||
typedef struct { | |||
PyObject* title; | |||
int level; | |||
@@ -82,21 +76,6 @@ static int heading_level_from_context(uint64_t n) | |||
} | |||
/* | |||
Call the given function in definitions.py, using 'in1' and 'in2' as | |||
parameters, and return its output as a bool. | |||
*/ | |||
static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2) | |||
{ | |||
PyObject* func = PyObject_GetAttrString(definitions, funcname); | |||
PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL); | |||
int ans = (result == Py_True) ? 1 : 0; | |||
Py_DECREF(func); | |||
Py_DECREF(result); | |||
return ans; | |||
} | |||
/* | |||
Sanitize the name of a tag so it can be compared with others for equality. | |||
*/ | |||
static PyObject* strip_tag_name(PyObject* token, int take_attr) | |||
@@ -516,7 +495,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | |||
Textbuffer_dealloc(buffer); | |||
if (!scheme) | |||
return -1; | |||
if (!IS_SCHEME(scheme, slashes)) { | |||
if (!is_scheme(scheme, slashes)) { | |||
Py_DECREF(scheme); | |||
Tokenizer_fail_route(self); | |||
return 0; | |||
@@ -565,7 +544,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||
} | |||
slashes = (Tokenizer_read(self, 0) == '/' && | |||
Tokenizer_read(self, 1) == '/'); | |||
if (!IS_SCHEME(scheme, slashes)) { | |||
if (!is_scheme(scheme, slashes)) { | |||
Py_DECREF(scheme); | |||
Textbuffer_dealloc(scheme_buffer); | |||
FAIL_ROUTE(0); | |||
@@ -1634,11 +1613,11 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) | |||
text = PyObject_GetAttrString(token, "text"); | |||
if (!text) | |||
return NULL; | |||
if (IS_SINGLE_ONLY(text)) { | |||
if (is_single_only(text)) { | |||
Py_DECREF(text); | |||
return Tokenizer_handle_single_only_tag_end(self); | |||
} | |||
if (IS_PARSABLE(text)) { | |||
if (is_parsable(text)) { | |||
Py_DECREF(text); | |||
return Tokenizer_parse(self, 0, 0); | |||
} | |||
@@ -1686,7 +1665,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) | |||
Textbuffer_dealloc(buf); | |||
return -1; | |||
} | |||
if (!IS_SINGLE_ONLY(name)) | |||
if (!is_single_only(name)) | |||
FAIL_ROUTE(0); | |||
Py_DECREF(name); | |||
break; | |||
@@ -2428,7 +2407,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) | |||
text = PyObject_GetAttrString(token, "text"); | |||
if (!text) | |||
return NULL; | |||
single = IS_SINGLE(text); | |||
single = is_single(text); | |||
Py_DECREF(text); | |||
if (single) | |||
return Tokenizer_handle_single_tag_end(self); | |||
@@ -162,11 +162,12 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||
self->skip_style_tags = skip_style_tags; | |||
tokens = Tokenizer_parse(self, context, 1); | |||
if ((!tokens && !PyErr_Occurred()) || self->topstack) { | |||
if (!ParserError) { | |||
if (load_exceptions()) | |||
return NULL; | |||
} | |||
if (!tokens || self->topstack) { | |||
Py_XDECREF(tokens); | |||
if (PyErr_Occurred()) | |||
return NULL; | |||
if (!ParserError && load_exceptions() < 0) | |||
return NULL; | |||
if (BAD_ROUTE) { | |||
RESET_ROUTE(); | |||
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); | |||