@@ -5,9 +5,10 @@ python: | |||||
- 3.2 | - 3.2 | ||||
- 3.3 | - 3.3 | ||||
- 3.4 | - 3.4 | ||||
- 3.5-dev | |||||
- 3.5 | |||||
sudo: false | sudo: false | ||||
install: | install: | ||||
- if [[ $TRAVIS_PYTHON_VERSION == '3.2' ]]; then pip install coverage==3.7.1; fi | |||||
- pip install coveralls | - pip install coveralls | ||||
- python setup.py build | - python setup.py build | ||||
script: | script: | ||||
@@ -1,3 +1,9 @@ | |||||
v0.4.3 (released October 29, 2015): | |||||
- Added Windows binaries for Python 3.5. | |||||
- Fixed edge cases involving wikilinks inside of external links and vice versa. | |||||
- Fixed a C tokenizer crash when a keyboard interrupt happens while parsing. | |||||
v0.4.2 (released July 30, 2015): | v0.4.2 (released July 30, 2015): | ||||
- Fixed setup script not including header files in releases. | - Fixed setup script not including header files in releases. | ||||
@@ -1,10 +1,11 @@ | |||||
# This config file is used by appveyor.com to build Windows release binaries | # This config file is used by appveyor.com to build Windows release binaries | ||||
version: 0.4.2-b{build} | |||||
version: 0.4.3-b{build} | |||||
branches: | branches: | ||||
only: | only: | ||||
- master | - master | ||||
- develop | |||||
skip_tags: true | skip_tags: true | ||||
@@ -44,7 +45,16 @@ environment: | |||||
PYTHON_VERSION: "3.4" | PYTHON_VERSION: "3.4" | ||||
PYTHON_ARCH: "64" | PYTHON_ARCH: "64" | ||||
- PYTHON: "C:\\Python35" | |||||
PYTHON_VERSION: "3.5" | |||||
PYTHON_ARCH: "32" | |||||
- PYTHON: "C:\\Python35-x64" | |||||
PYTHON_VERSION: "3.5" | |||||
PYTHON_ARCH: "64" | |||||
install: | install: | ||||
- "%PIP% install --disable-pip-version-check --user --upgrade pip" | |||||
- "%PIP% install wheel twine" | - "%PIP% install wheel twine" | ||||
build_script: | build_script: | ||||
@@ -57,7 +67,7 @@ after_test: | |||||
- "%SETUPPY% bdist_wheel" | - "%SETUPPY% bdist_wheel" | ||||
on_success: | on_success: | ||||
- "%PYMOD% twine upload dist\\* -u %PYPI_USERNAME% -p %PYPI_PASSWORD%" | |||||
- "IF %APPVEYOR_REPO_BRANCH%==master %PYMOD% twine upload dist\\* -u %PYPI_USERNAME% -p %PYPI_PASSWORD%" | |||||
artifacts: | artifacts: | ||||
- path: dist\* | - path: dist\* | ||||
@@ -1,6 +1,16 @@ | |||||
Changelog | Changelog | ||||
========= | ========= | ||||
v0.4.3 | |||||
------ | |||||
`Released October 29, 2015 <https://github.com/earwig/mwparserfromhell/tree/v0.4.3>`_ | |||||
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.4.2...v0.4.3>`__): | |||||
- Added Windows binaries for Python 3.5. | |||||
- Fixed edge cases involving wikilinks inside of external links and vice versa. | |||||
- Fixed a C tokenizer crash when a keyboard interrupt happens while parsing. | |||||
v0.4.2 | v0.4.2 | ||||
------ | ------ | ||||
@@ -29,7 +29,7 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode. | |||||
__author__ = "Ben Kurtovic" | __author__ = "Ben Kurtovic" | ||||
__copyright__ = "Copyright (C) 2012, 2013, 2014, 2015 Ben Kurtovic" | __copyright__ = "Copyright (C) 2012, 2013, 2014, 2015 Ben Kurtovic" | ||||
__license__ = "MIT License" | __license__ = "MIT License" | ||||
__version__ = "0.4.2" | |||||
__version__ = "0.4.3" | |||||
__email__ = "ben.kurtovic@gmail.com" | __email__ = "ben.kurtovic@gmail.com" | ||||
from . import (compat, definitions, nodes, parser, smart_list, string_mixin, | from . import (compat, definitions, nodes, parser, smart_list, string_mixin, | ||||
@@ -20,7 +20,13 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
"""Contains data about certain markup, like HTML tags and external links.""" | |||||
""" | |||||
Contains data about certain markup, like HTML tags and external links. | |||||
When updating this file, please also update the C tokenizer version: | |||||
- mwparserfromhell/parser/ctokenizer/definitions.c | |||||
- mwparserfromhell/parser/ctokenizer/definitions.h | |||||
""" | |||||
from __future__ import unicode_literals | from __future__ import unicode_literals | ||||
@@ -0,0 +1,134 @@ | |||||
/* | |||||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||||
this software and associated documentation files (the "Software"), to deal in | |||||
the Software without restriction, including without limitation the rights to | |||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||||
of the Software, and to permit persons to whom the Software is furnished to do | |||||
so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. | |||||
*/ | |||||
#include "definitions.h" | |||||
/* | |||||
This file should be kept up to date with mwparserfromhell/definitions.py. | |||||
See the Python version for data sources. | |||||
*/ | |||||
static const char* URI_SCHEMES[] = { | |||||
"http", "https", "ftp", "ftps", "ssh", "sftp", "irc", "ircs", "xmpp", | |||||
"sip", "sips", "gopher", "telnet", "nntp", "worldwind", "mailto", "tel", | |||||
"sms", "news", "svn", "git", "mms", "bitcoin", "magnet", "urn", "geo", NULL | |||||
}; | |||||
static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = { | |||||
"xmpp", "sip", "sips", "mailto", "tel", "sms", "news", "bitcoin", "magnet", | |||||
"urn", "geo", NULL | |||||
}; | |||||
static const char* PARSER_BLACKLIST[] = { | |||||
"categorytree", "gallery", "hiero", "imagemap", "inputbox", "math", | |||||
"nowiki", "pre", "score", "section", "source", "syntaxhighlight", | |||||
"templatedata", "timeline", NULL | |||||
}; | |||||
static const char* SINGLE[] = { | |||||
"br", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr", NULL | |||||
}; | |||||
static const char* SINGLE_ONLY[] = { | |||||
"br", "hr", "meta", "link", "img", NULL | |||||
}; | |||||
/* | |||||
Convert a PyUnicodeObject to a lowercase ASCII char* array and store it in | |||||
the second argument. The caller must free the return value when finished. | |||||
If the return value is NULL, the conversion failed and *string is not set. | |||||
*/ | |||||
static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string) | |||||
{ | |||||
PyObject *lower = PyObject_CallMethod(input, "lower", NULL), *bytes; | |||||
if (!lower) | |||||
return NULL; | |||||
bytes = PyUnicode_AsASCIIString(lower); | |||||
Py_DECREF(lower); | |||||
if (!bytes) { | |||||
if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) | |||||
PyErr_Clear(); | |||||
return NULL; | |||||
} | |||||
*string = PyBytes_AS_STRING(bytes); | |||||
return bytes; | |||||
} | |||||
/* | |||||
Return whether a PyUnicodeObject is in a list of lowercase ASCII strings. | |||||
*/ | |||||
static int unicode_in_string_list(PyObject *input, const char **list) | |||||
{ | |||||
const char *string; | |||||
PyObject *temp = unicode_to_lcase_ascii(input, &string); | |||||
int retval = 0; | |||||
if (!temp) | |||||
return 0; | |||||
while (*list) { | |||||
if (!strcmp(*(list++), string)) { | |||||
retval = 1; | |||||
goto end; | |||||
} | |||||
} | |||||
end: | |||||
Py_DECREF(temp); | |||||
return retval; | |||||
} | |||||
/* | |||||
Return if the given tag's contents should be passed to the parser. | |||||
*/ | |||||
int is_parsable(PyObject *tag) | |||||
{ | |||||
return !unicode_in_string_list(tag, PARSER_BLACKLIST); | |||||
} | |||||
/* | |||||
Return whether or not the given tag can exist without a close tag. | |||||
*/ | |||||
int is_single(PyObject *tag) | |||||
{ | |||||
return unicode_in_string_list(tag, SINGLE); | |||||
} | |||||
/* | |||||
Return whether or not the given tag must exist without a close tag. | |||||
*/ | |||||
int is_single_only(PyObject *tag) | |||||
{ | |||||
return unicode_in_string_list(tag, SINGLE_ONLY); | |||||
} | |||||
/* | |||||
Return whether the given scheme is valid for external links. | |||||
*/ | |||||
int is_scheme(PyObject *scheme, int slashes) | |||||
{ | |||||
if (slashes) | |||||
return unicode_in_string_list(scheme, URI_SCHEMES); | |||||
else | |||||
return unicode_in_string_list(scheme, URI_SCHEMES_AUTHORITY_OPTIONAL); | |||||
} |
@@ -0,0 +1,39 @@ | |||||
/* | |||||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||||
this software and associated documentation files (the "Software"), to deal in | |||||
the Software without restriction, including without limitation the rights to | |||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||||
of the Software, and to permit persons to whom the Software is furnished to do | |||||
so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. | |||||
*/ | |||||
#pragma once | |||||
#include "common.h" | |||||
/* This file should be kept up to date with mwparserfromhell/definitions.py. */ | |||||
/* Functions */ | |||||
int is_parsable(PyObject*); | |||||
int is_single(PyObject*); | |||||
int is_single_only(PyObject*); | |||||
int is_scheme(PyObject*, int); | |||||
/* Macros */ | |||||
/* Map a list/description markup character to its HTML tag name: ':' gives
   "dd", ';' gives "dt", and anything else gives "li". The argument is
   parenthesized so that expression arguments expand with the intended
   precedence; note that it may be evaluated twice. */
#define GET_HTML_TAG(markup)                                                  \
    ((markup) == ':' ? "dd" : (markup) == ';' ? "dt" : "li")
@@ -22,6 +22,7 @@ SOFTWARE. | |||||
#include "tok_parse.h" | #include "tok_parse.h" | ||||
#include "contexts.h" | #include "contexts.h" | ||||
#include "definitions.h" | |||||
#include "tag_data.h" | #include "tag_data.h" | ||||
#include "tok_support.h" | #include "tok_support.h" | ||||
#include "tokens.h" | #include "tokens.h" | ||||
@@ -29,17 +30,11 @@ SOFTWARE. | |||||
#define DIGITS "0123456789" | #define DIGITS "0123456789" | ||||
#define HEXDIGITS "0123456789abcdefABCDEF" | #define HEXDIGITS "0123456789abcdefABCDEF" | ||||
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" | #define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" | ||||
#define URISCHEME "abcdefghijklmnopqrstuvwxyz0123456789+.-" | |||||
#define MAX_BRACES 255 | #define MAX_BRACES 255 | ||||
#define MAX_ENTITY_SIZE 8 | #define MAX_ENTITY_SIZE 8 | ||||
#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li") | |||||
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL)) | |||||
#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL)) | |||||
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL)) | |||||
#define IS_SCHEME(scheme, slashes) \ | |||||
(call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False)) | |||||
typedef struct { | typedef struct { | ||||
PyObject* title; | PyObject* title; | ||||
int level; | int level; | ||||
@@ -47,6 +42,8 @@ typedef struct { | |||||
/* Forward declarations */ | /* Forward declarations */ | ||||
static PyObject* Tokenizer_really_parse_external_link( | |||||
Tokenizer*, int, Textbuffer*); | |||||
static int Tokenizer_parse_entity(Tokenizer*); | static int Tokenizer_parse_entity(Tokenizer*); | ||||
static int Tokenizer_parse_comment(Tokenizer*); | static int Tokenizer_parse_comment(Tokenizer*); | ||||
static int Tokenizer_handle_dl_term(Tokenizer*); | static int Tokenizer_handle_dl_term(Tokenizer*); | ||||
@@ -80,21 +77,6 @@ static int heading_level_from_context(uint64_t n) | |||||
} | } | ||||
/* | /* | ||||
Call the given function in definitions.py, using 'in1' and 'in2' as | |||||
parameters, and return its output as a bool. | |||||
*/ | |||||
static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2)
{
    PyObject *func, *result;
    int ans;

    /* If the attribute lookup fails, report "false" instead of passing NULL
       on to the call and Py_DECREF below. The Python exception raised by the
       lookup is left pending. */
    func = PyObject_GetAttrString(definitions, funcname);
    if (!func)
        return 0;

    /* in2 may be NULL, in which case it terminates the argument list early
       and the function is called with in1 only. */
    result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL);
    Py_DECREF(func);
    if (!result)
        return 0;  /* the call raised; nothing to release, exception pending */

    /* Only the True singleton counts as a positive answer. */
    ans = (result == Py_True) ? 1 : 0;
    Py_DECREF(result);
    return ans;
}
/* | |||||
Sanitize the name of a tag so it can be compared with others for equality. | Sanitize the name of a tag so it can be compared with others for equality. | ||||
*/ | */ | ||||
static PyObject* strip_tag_name(PyObject* token, int take_attr) | static PyObject* strip_tag_name(PyObject* token, int take_attr) | ||||
@@ -362,30 +344,70 @@ static PyObject* Tokenizer_handle_argument_end(Tokenizer* self) | |||||
static int Tokenizer_parse_wikilink(Tokenizer* self) | static int Tokenizer_parse_wikilink(Tokenizer* self) | ||||
{ | { | ||||
Py_ssize_t reset; | Py_ssize_t reset; | ||||
PyObject *wikilink; | |||||
PyObject *extlink, *wikilink, *kwargs; | |||||
reset = self->head + 1; | |||||
self->head += 2; | self->head += 2; | ||||
reset = self->head - 1; | |||||
wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE, 1); | |||||
// If the wikilink looks like an external link, parse it as such: | |||||
extlink = Tokenizer_really_parse_external_link(self, 1, NULL); | |||||
if (BAD_ROUTE) { | if (BAD_ROUTE) { | ||||
RESET_ROUTE(); | RESET_ROUTE(); | ||||
self->head = reset + 1; | |||||
// Otherwise, actually parse it as a wikilink: | |||||
wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE, 1); | |||||
if (BAD_ROUTE) { | |||||
RESET_ROUTE(); | |||||
self->head = reset; | |||||
if (Tokenizer_emit_text(self, "[[")) | |||||
return -1; | |||||
return 0; | |||||
} | |||||
if (!wikilink) | |||||
return -1; | |||||
if (Tokenizer_emit(self, WikilinkOpen)) { | |||||
Py_DECREF(wikilink); | |||||
return -1; | |||||
} | |||||
if (Tokenizer_emit_all(self, wikilink)) { | |||||
Py_DECREF(wikilink); | |||||
return -1; | |||||
} | |||||
Py_DECREF(wikilink); | |||||
if (Tokenizer_emit(self, WikilinkClose)) | |||||
return -1; | |||||
return 0; | |||||
} | |||||
if (!extlink) | |||||
return -1; | |||||
if (self->topstack->context & LC_EXT_LINK_TITLE) { | |||||
// In this exceptional case, an external link that looks like a | |||||
// wikilink inside of an external link is parsed as text: | |||||
Py_DECREF(extlink); | |||||
self->head = reset; | self->head = reset; | ||||
if (Tokenizer_emit_text(self, "[[")) | if (Tokenizer_emit_text(self, "[[")) | ||||
return -1; | return -1; | ||||
return 0; | return 0; | ||||
} | } | ||||
if (!wikilink) | |||||
if (Tokenizer_emit_text(self, "[")) { | |||||
Py_DECREF(extlink); | |||||
return -1; | return -1; | ||||
if (Tokenizer_emit(self, WikilinkOpen)) { | |||||
Py_DECREF(wikilink); | |||||
} | |||||
kwargs = PyDict_New(); | |||||
if (!kwargs) { | |||||
Py_DECREF(extlink); | |||||
return -1; | return -1; | ||||
} | } | ||||
if (Tokenizer_emit_all(self, wikilink)) { | |||||
Py_DECREF(wikilink); | |||||
PyDict_SetItemString(kwargs, "brackets", Py_True); | |||||
if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) { | |||||
Py_DECREF(extlink); | |||||
return -1; | |||||
} | |||||
if (Tokenizer_emit_all(self, extlink)) { | |||||
Py_DECREF(extlink); | |||||
return -1; | return -1; | ||||
} | } | ||||
Py_DECREF(wikilink); | |||||
if (Tokenizer_emit(self, WikilinkClose)) | |||||
Py_DECREF(extlink); | |||||
if (Tokenizer_emit(self, ExternalLinkClose)) | |||||
return -1; | return -1; | ||||
return 0; | return 0; | ||||
} | } | ||||
@@ -417,7 +439,7 @@ static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self) | |||||
*/ | */ | ||||
static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | ||||
{ | { | ||||
static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; | |||||
static const char* valid = URISCHEME; | |||||
Textbuffer* buffer; | Textbuffer* buffer; | ||||
PyObject* scheme; | PyObject* scheme; | ||||
Unicode this; | Unicode this; | ||||
@@ -474,7 +496,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | |||||
Textbuffer_dealloc(buffer); | Textbuffer_dealloc(buffer); | ||||
if (!scheme) | if (!scheme) | ||||
return -1; | return -1; | ||||
if (!IS_SCHEME(scheme, slashes)) { | |||||
if (!is_scheme(scheme, slashes)) { | |||||
Py_DECREF(scheme); | Py_DECREF(scheme); | ||||
Tokenizer_fail_route(self); | Tokenizer_fail_route(self); | ||||
return 0; | return 0; | ||||
@@ -489,7 +511,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | |||||
*/ | */ | ||||
static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | ||||
{ | { | ||||
static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; | |||||
static const char* valid = URISCHEME; | |||||
Textbuffer *scheme_buffer = Textbuffer_new(&self->text); | Textbuffer *scheme_buffer = Textbuffer_new(&self->text); | ||||
PyObject *scheme; | PyObject *scheme; | ||||
Unicode chunk; | Unicode chunk; | ||||
@@ -523,7 +545,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||||
} | } | ||||
slashes = (Tokenizer_read(self, 0) == '/' && | slashes = (Tokenizer_read(self, 0) == '/' && | ||||
Tokenizer_read(self, 1) == '/'); | Tokenizer_read(self, 1) == '/'); | ||||
if (!IS_SCHEME(scheme, slashes)) { | |||||
if (!is_scheme(scheme, slashes)) { | |||||
Py_DECREF(scheme); | Py_DECREF(scheme); | ||||
Textbuffer_dealloc(scheme_buffer); | Textbuffer_dealloc(scheme_buffer); | ||||
FAIL_ROUTE(0); | FAIL_ROUTE(0); | ||||
@@ -553,7 +575,7 @@ static int Tokenizer_handle_free_link_text( | |||||
Tokenizer* self, int* parens, Textbuffer* tail, Unicode this) | Tokenizer* self, int* parens, Textbuffer* tail, Unicode this) | ||||
{ | { | ||||
#define PUSH_TAIL_BUFFER(tail, error) \ | #define PUSH_TAIL_BUFFER(tail, error) \ | ||||
if (tail->length > 0) { \ | |||||
if (tail && tail->length > 0) { \ | |||||
if (Textbuffer_concat(self->topstack->textbuffer, tail)) \ | if (Textbuffer_concat(self->topstack->textbuffer, tail)) \ | ||||
return error; \ | return error; \ | ||||
if (Textbuffer_reset(tail)) \ | if (Textbuffer_reset(tail)) \ | ||||
@@ -1592,11 +1614,11 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) | |||||
text = PyObject_GetAttrString(token, "text"); | text = PyObject_GetAttrString(token, "text"); | ||||
if (!text) | if (!text) | ||||
return NULL; | return NULL; | ||||
if (IS_SINGLE_ONLY(text)) { | |||||
if (is_single_only(text)) { | |||||
Py_DECREF(text); | Py_DECREF(text); | ||||
return Tokenizer_handle_single_only_tag_end(self); | return Tokenizer_handle_single_only_tag_end(self); | ||||
} | } | ||||
if (IS_PARSABLE(text)) { | |||||
if (is_parsable(text)) { | |||||
Py_DECREF(text); | Py_DECREF(text); | ||||
return Tokenizer_parse(self, 0, 0); | return Tokenizer_parse(self, 0, 0); | ||||
} | } | ||||
@@ -1644,7 +1666,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) | |||||
Textbuffer_dealloc(buf); | Textbuffer_dealloc(buf); | ||||
return -1; | return -1; | ||||
} | } | ||||
if (!IS_SINGLE_ONLY(name)) | |||||
if (!is_single_only(name)) | |||||
FAIL_ROUTE(0); | FAIL_ROUTE(0); | ||||
Py_DECREF(name); | Py_DECREF(name); | ||||
break; | break; | ||||
@@ -2108,7 +2130,7 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, | |||||
/* | /* | ||||
Handle style attributes for a table until an ending token. | Handle style attributes for a table until an ending token. | ||||
*/ | */ | ||||
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token) | |||||
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token) | |||||
{ | { | ||||
TagData *data = TagData_new(&self->text); | TagData *data = TagData_new(&self->text); | ||||
PyObject *padding, *trash; | PyObject *padding, *trash; | ||||
@@ -2386,7 +2408,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) | |||||
text = PyObject_GetAttrString(token, "text"); | text = PyObject_GetAttrString(token, "text"); | ||||
if (!text) | if (!text) | ||||
return NULL; | return NULL; | ||||
single = IS_SINGLE(text); | |||||
single = is_single(text); | |||||
Py_DECREF(text); | Py_DECREF(text); | ||||
if (single) | if (single) | ||||
return Tokenizer_handle_single_tag_end(self); | return Tokenizer_handle_single_tag_end(self); | ||||
@@ -24,7 +24,7 @@ SOFTWARE. | |||||
#include "common.h" | #include "common.h" | ||||
static const char MARKERS[] = { | |||||
static const Unicode MARKERS[] = { | |||||
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', | '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', | ||||
'-', '!', '\n', '\0'}; | '-', '!', '\n', '\0'}; | ||||
@@ -162,11 +162,12 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||||
self->skip_style_tags = skip_style_tags; | self->skip_style_tags = skip_style_tags; | ||||
tokens = Tokenizer_parse(self, context, 1); | tokens = Tokenizer_parse(self, context, 1); | ||||
if ((!tokens && !PyErr_Occurred()) || self->topstack) { | |||||
if (!ParserError) { | |||||
if (load_exceptions()) | |||||
return NULL; | |||||
} | |||||
if (!tokens || self->topstack) { | |||||
Py_XDECREF(tokens); | |||||
if (PyErr_Occurred()) | |||||
return NULL; | |||||
if (!ParserError && load_exceptions() < 0) | |||||
return NULL; | |||||
if (BAD_ROUTE) { | if (BAD_ROUTE) { | ||||
RESET_ROUTE(); | RESET_ROUTE(); | ||||
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); | PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); | ||||
@@ -299,17 +299,34 @@ class Tokenizer(object): | |||||
def _parse_wikilink(self): | def _parse_wikilink(self): | ||||
"""Parse an internal wikilink at the head of the wikicode string.""" | """Parse an internal wikilink at the head of the wikicode string.""" | ||||
reset = self._head + 1 | |||||
self._head += 2 | self._head += 2 | ||||
reset = self._head - 1 | |||||
try: | try: | ||||
wikilink = self._parse(contexts.WIKILINK_TITLE) | |||||
# If the wikilink looks like an external link, parse it as such: | |||||
link, extra, delta = self._really_parse_external_link(True) | |||||
except BadRoute: | except BadRoute: | ||||
self._head = reset | |||||
self._emit_text("[[") | |||||
self._head = reset + 1 | |||||
try: | |||||
# Otherwise, actually parse it as a wikilink: | |||||
wikilink = self._parse(contexts.WIKILINK_TITLE) | |||||
except BadRoute: | |||||
self._head = reset | |||||
self._emit_text("[[") | |||||
else: | |||||
self._emit(tokens.WikilinkOpen()) | |||||
self._emit_all(wikilink) | |||||
self._emit(tokens.WikilinkClose()) | |||||
else: | else: | ||||
self._emit(tokens.WikilinkOpen()) | |||||
self._emit_all(wikilink) | |||||
self._emit(tokens.WikilinkClose()) | |||||
if self._context & contexts.EXT_LINK_TITLE: | |||||
# In this exceptional case, an external link that looks like a | |||||
# wikilink inside of an external link is parsed as text: | |||||
self._head = reset | |||||
self._emit_text("[[") | |||||
return | |||||
self._emit_text("[") | |||||
self._emit(tokens.ExternalLinkOpen(brackets=True)) | |||||
self._emit_all(link) | |||||
self._emit(tokens.ExternalLinkClose()) | |||||
def _handle_wikilink_separator(self): | def _handle_wikilink_separator(self): | ||||
"""Handle the separator between a wikilink's title and its text.""" | """Handle the separator between a wikilink's title and its text.""" | ||||
@@ -65,7 +65,7 @@ do_git_stuff() { | |||||
git commit -qam "release/$VERSION" | git commit -qam "release/$VERSION" | ||||
git tag v$VERSION -s -m "version $VERSION" | git tag v$VERSION -s -m "version $VERSION" | ||||
git checkout -q master | git checkout -q master | ||||
git merge -q --no-ff develop -m "Merge branch 'develop'" | |||||
git merge -q --no-ff develop -m "Merge develop into master (release/$VERSION)" | |||||
echo -n " pushing..." | echo -n " pushing..." | ||||
git push -q --tags origin master | git push -q --tags origin master | ||||
git checkout -q develop | git checkout -q develop | ||||
@@ -21,23 +21,35 @@ | |||||
SET COMMAND_TO_RUN=%* | SET COMMAND_TO_RUN=%* | ||||
SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows | SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows | ||||
SET WIN_WDK=c:\Program Files (x86)\Windows Kits\10\Include\wdf | |||||
SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%" | |||||
IF %MAJOR_PYTHON_VERSION% == "2" ( | |||||
SET MAJOR_PYTHON_VERSION=%PYTHON_VERSION:~0,1% | |||||
SET MINOR_PYTHON_VERSION=%PYTHON_VERSION:~2% | |||||
IF %MAJOR_PYTHON_VERSION% == 2 ( | |||||
SET WINDOWS_SDK_VERSION="v7.0" | SET WINDOWS_SDK_VERSION="v7.0" | ||||
) ELSE IF %MAJOR_PYTHON_VERSION% == "3" ( | |||||
) ELSE IF %MAJOR_PYTHON_VERSION% == 3 ( | |||||
SET WINDOWS_SDK_VERSION="v7.1" | SET WINDOWS_SDK_VERSION="v7.1" | ||||
IF %MINOR_PYTHON_VERSION% GEQ 5 ( | |||||
SET NO_SET_SDK_64=Y | |||||
) | |||||
) ELSE ( | ) ELSE ( | ||||
ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" | ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" | ||||
EXIT 1 | EXIT 1 | ||||
) | ) | ||||
IF "%PYTHON_ARCH%"=="64" ( | |||||
IF "%PYTHON_ARCH%"=="32" ( | |||||
call %COMMAND_TO_RUN% || EXIT 1 | |||||
) ELSE IF "%NO_SET_SDK_64%"=="Y" ( | |||||
IF EXIST "%WIN_WDK%" ( | |||||
:: See: https://connect.microsoft.com/VisualStudio/feedback/details/1610302/ | |||||
REN "%WIN_WDK%" 0wdf | |||||
) | |||||
call %COMMAND_TO_RUN% || EXIT 1 | |||||
) ELSE ( | |||||
SET DISTUTILS_USE_SDK=1 | SET DISTUTILS_USE_SDK=1 | ||||
SET MSSdk=1 | SET MSSdk=1 | ||||
"%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% | ||||
"%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release | ||||
call %COMMAND_TO_RUN% || EXIT 1 | call %COMMAND_TO_RUN% || EXIT 1 | ||||
) ELSE ( | |||||
call %COMMAND_TO_RUN% || EXIT 1 | |||||
) | ) |
@@ -82,6 +82,13 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), Exter | |||||
--- | --- | ||||
name: brackets_recursive_2 | |||||
label: bracket-enclosed link with a double bracket-enclosed link as the title | |||||
input: "[http://example.com [[http://example.com]]]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="[[http://example.com"), ExternalLinkClose(), Text(text="]]")] | |||||
--- | |||||
name: period_after | name: period_after | ||||
label: a period after a free link that is excluded | label: a period after a free link that is excluded | ||||
input: "http://example.com." | input: "http://example.com." | ||||
@@ -175,7 +175,7 @@ output: [WikilinkOpen(), Text(text="File:Example.png"), WikilinkSeparator(), Tex | |||||
--- | --- | ||||
name: external_link_inside_wikilink_title | name: external_link_inside_wikilink_title | ||||
label: an external link inside a wikilink title, which is invalid | |||||
label: an external link inside a wikilink title, which is not parsed | |||||
input: "[[File:Example.png http://example.com]]" | input: "[[File:Example.png http://example.com]]" | ||||
output: [WikilinkOpen(), Text(text="File:Example.png http://example.com"), WikilinkClose()] | output: [WikilinkOpen(), Text(text="File:Example.png http://example.com"), WikilinkClose()] | ||||
@@ -318,3 +318,17 @@ name: incomplete_comment_in_link_title_6 | |||||
label: incomplete comments are invalid in link titles | label: incomplete comments are invalid in link titles | ||||
input: "[[foo<!--bar" | input: "[[foo<!--bar" | ||||
output: [Text(text="[[foo<!--bar")] | output: [Text(text="[[foo<!--bar")] | ||||
--- | |||||
name: wikilink_to_external_link_fallback | |||||
label: an external link enclosed in an extra pair of brackets (see issue #120) | |||||
input: "[[http://example.com foo bar]]" | |||||
output: [Text(text="["), ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="foo bar"), ExternalLinkClose(), Text(text="]")] | |||||
--- | |||||
name: wikilink_to_external_link_fallback_2 | |||||
label: an external link enclosed in an extra pair of brackets (see issue #120) | |||||
input: "[[http://example.com]]" | |||||
output: [Text(text="["), ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkClose(), Text(text="]")] |
@@ -632,3 +632,17 @@ name: unparsable_with_intermediates_normalize | |||||
label: an unparsable tag with intermediate tags inside of it, requiring normalization | label: an unparsable tag with intermediate tags inside of it, requiring normalization | ||||
input: "<nowiki><ref></ref></nowIKI >" | input: "<nowiki><ref></ref></nowIKI >" | ||||
output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="<ref></ref>"), TagOpenClose(), Text(text="nowIKI "), TagCloseClose()] | output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="<ref></ref>"), TagOpenClose(), Text(text="nowIKI "), TagCloseClose()] | ||||
--- | |||||
name: non_ascii_open | |||||
label: an open tag containing non-ASCII characters | |||||
input: "<éxamplé>" | |||||
output: [Text(text="<éxamplé>")] | |||||
--- | |||||
name: non_ascii_full | |||||
label: an open/close tag pair containing non-ASCII characters | |||||
input: "<éxamplé></éxamplé>" | |||||
output: [TagOpenOpen(), Text(text="éxamplé"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="éxamplé"), TagCloseClose()] |