@@ -13,6 +13,7 @@ v0.4 (unreleased): | |||
- Given the frequency of issues with the (admittedly insufficient) tag parser, | |||
there's a temporary skip_style_tags argument to parse() that ignores '' and | |||
''' until these issues are corrected. | |||
- Fixed a parser bug involving nested wikilinks and external links. | |||
- C code cleanup and speed improvements. | |||
v0.3.2 (released September 1, 2013): | |||
@@ -24,6 +24,7 @@ Unreleased | |||
there's a temporary *skip_style_tags* argument to | |||
:py:meth:`~mwparserfromhell.parse` that ignores ``''`` and ``'''`` until | |||
these issues are corrected. | |||
- Fixed a parser bug involving nested wikilinks and external links. | |||
- C code cleanup and speed improvements. | |||
v0.3.2 | |||
@@ -55,7 +55,6 @@ Local (stack-specific) contexts: | |||
* :py:const:`EXT_LINK_URI` | |||
* :py:const:`EXT_LINK_TITLE` | |||
* :py:const:`EXT_LINK_BRACKETS` | |||
* :py:const:`HEADING` | |||
@@ -100,7 +99,8 @@ Aggregate contexts: | |||
* :py:const:`FAIL` | |||
* :py:const:`UNSAFE` | |||
* :py:const:`DOUBLE` | |||
* :py:const:`INVALID_LINK` | |||
* :py:const:`NO_WIKILINKS` | |||
* :py:const:`NO_EXT_LINKS` | |||
""" | |||
@@ -121,38 +121,37 @@ WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT | |||
EXT_LINK_URI = 1 << 7 | |||
EXT_LINK_TITLE = 1 << 8 | |||
EXT_LINK_BRACKETS = 1 << 9 | |||
EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + EXT_LINK_BRACKETS | |||
HEADING_LEVEL_1 = 1 << 10 | |||
HEADING_LEVEL_2 = 1 << 11 | |||
HEADING_LEVEL_3 = 1 << 12 | |||
HEADING_LEVEL_4 = 1 << 13 | |||
HEADING_LEVEL_5 = 1 << 14 | |||
HEADING_LEVEL_6 = 1 << 15 | |||
EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE | |||
HEADING_LEVEL_1 = 1 << 9 | |||
HEADING_LEVEL_2 = 1 << 10 | |||
HEADING_LEVEL_3 = 1 << 11 | |||
HEADING_LEVEL_4 = 1 << 12 | |||
HEADING_LEVEL_5 = 1 << 13 | |||
HEADING_LEVEL_6 = 1 << 14 | |||
HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 + | |||
HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6) | |||
TAG_OPEN = 1 << 16 | |||
TAG_ATTR = 1 << 17 | |||
TAG_BODY = 1 << 18 | |||
TAG_CLOSE = 1 << 19 | |||
TAG_OPEN = 1 << 15 | |||
TAG_ATTR = 1 << 16 | |||
TAG_BODY = 1 << 17 | |||
TAG_CLOSE = 1 << 18 | |||
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE | |||
STYLE_ITALICS = 1 << 20 | |||
STYLE_BOLD = 1 << 21 | |||
STYLE_PASS_AGAIN = 1 << 22 | |||
STYLE_SECOND_PASS = 1 << 23 | |||
STYLE_ITALICS = 1 << 19 | |||
STYLE_BOLD = 1 << 20 | |||
STYLE_PASS_AGAIN = 1 << 21 | |||
STYLE_SECOND_PASS = 1 << 22 | |||
STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS | |||
DL_TERM = 1 << 24 | |||
DL_TERM = 1 << 23 | |||
HAS_TEXT = 1 << 25 | |||
FAIL_ON_TEXT = 1 << 26 | |||
FAIL_NEXT = 1 << 27 | |||
FAIL_ON_LBRACE = 1 << 28 | |||
FAIL_ON_RBRACE = 1 << 29 | |||
FAIL_ON_EQUALS = 1 << 30 | |||
HAS_TEXT = 1 << 24 | |||
FAIL_ON_TEXT = 1 << 25 | |||
FAIL_NEXT = 1 << 26 | |||
FAIL_ON_LBRACE = 1 << 27 | |||
FAIL_ON_RBRACE = 1 << 28 | |||
FAIL_ON_EQUALS = 1 << 29 | |||
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | |||
FAIL_ON_RBRACE + FAIL_ON_EQUALS) | |||
@@ -163,7 +162,8 @@ GL_HEADING = 1 << 0 | |||
# Aggregate contexts: | |||
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE | |||
UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + | |||
ARGUMENT_NAME + TAG_CLOSE) | |||
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + | |||
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) | |||
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE | |||
INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK + EXT_LINK | |||
NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI | |||
NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK |
@@ -1158,7 +1158,7 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) | |||
*/ | |||
static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) | |||
{ | |||
#define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK | |||
#define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS | |||
#define NOT_A_LINK \ | |||
if (!brackets && self->topstack->context & LC_DLTERM) \ | |||
return Tokenizer_handle_dl_term(self); \ | |||
@@ -2440,10 +2440,8 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | |||
{ | |||
if (context & LC_FAIL_NEXT) | |||
return -1; | |||
if (context & LC_WIKILINK) { | |||
if (context & LC_WIKILINK_TEXT) | |||
return (data == '[' && Tokenizer_READ(self, 1) == '[') ? -1 : 0; | |||
else if (data == ']' || data == '{') | |||
if (context & LC_WIKILINK_TITLE) { | |||
if (data == ']' || data == '{') | |||
self->topstack->context |= LC_FAIL_NEXT; | |||
else if (data == '\n' || data == '[' || data == '}') | |||
return -1; | |||
@@ -2577,7 +2575,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||
return NULL; | |||
} | |||
else if (this == next && next == '[' && Tokenizer_CAN_RECURSE(self)) { | |||
if (!(this_context & AGG_INVALID_LINK)) { | |||
if (!(this_context & AGG_NO_WIKILINKS)) { | |||
if (Tokenizer_parse_wikilink(self)) | |||
return NULL; | |||
} | |||
@@ -121,40 +121,39 @@ static PyObject* TagCloseClose; | |||
#define LC_WIKILINK_TITLE 0x00000020 | |||
#define LC_WIKILINK_TEXT 0x00000040 | |||
#define LC_EXT_LINK 0x00000380 | |||
#define LC_EXT_LINK 0x00000180 | |||
#define LC_EXT_LINK_URI 0x00000080 | |||
#define LC_EXT_LINK_TITLE 0x00000100 | |||
#define LC_EXT_LINK_BRACKETS 0x00000200 | |||
#define LC_HEADING 0x0000FC00 | |||
#define LC_HEADING_LEVEL_1 0x00000400 | |||
#define LC_HEADING_LEVEL_2 0x00000800 | |||
#define LC_HEADING_LEVEL_3 0x00001000 | |||
#define LC_HEADING_LEVEL_4 0x00002000 | |||
#define LC_HEADING_LEVEL_5 0x00004000 | |||
#define LC_HEADING_LEVEL_6 0x00008000 | |||
#define LC_TAG 0x000F0000 | |||
#define LC_TAG_OPEN 0x00010000 | |||
#define LC_TAG_ATTR 0x00020000 | |||
#define LC_TAG_BODY 0x00040000 | |||
#define LC_TAG_CLOSE 0x00080000 | |||
#define LC_STYLE 0x00F00000 | |||
#define LC_STYLE_ITALICS 0x00100000 | |||
#define LC_STYLE_BOLD 0x00200000 | |||
#define LC_STYLE_PASS_AGAIN 0x00400000 | |||
#define LC_STYLE_SECOND_PASS 0x00800000 | |||
#define LC_DLTERM 0x01000000 | |||
#define LC_SAFETY_CHECK 0x7E000000 | |||
#define LC_HAS_TEXT 0x02000000 | |||
#define LC_FAIL_ON_TEXT 0x04000000 | |||
#define LC_FAIL_NEXT 0x08000000 | |||
#define LC_FAIL_ON_LBRACE 0x10000000 | |||
#define LC_FAIL_ON_RBRACE 0x20000000 | |||
#define LC_FAIL_ON_EQUALS 0x40000000 | |||
#define LC_HEADING 0x00007E00 | |||
#define LC_HEADING_LEVEL_1 0x00000200 | |||
#define LC_HEADING_LEVEL_2 0x00000400 | |||
#define LC_HEADING_LEVEL_3 0x00000800 | |||
#define LC_HEADING_LEVEL_4 0x00001000 | |||
#define LC_HEADING_LEVEL_5 0x00002000 | |||
#define LC_HEADING_LEVEL_6 0x00004000 | |||
#define LC_TAG 0x00078000 | |||
#define LC_TAG_OPEN 0x00008000 | |||
#define LC_TAG_ATTR 0x00010000 | |||
#define LC_TAG_BODY 0x00020000 | |||
#define LC_TAG_CLOSE 0x00040000 | |||
#define LC_STYLE 0x00780000 | |||
#define LC_STYLE_ITALICS 0x00080000 | |||
#define LC_STYLE_BOLD 0x00100000 | |||
#define LC_STYLE_PASS_AGAIN 0x00200000 | |||
#define LC_STYLE_SECOND_PASS 0x00400000 | |||
#define LC_DLTERM 0x00800000 | |||
#define LC_SAFETY_CHECK 0x3F000000 | |||
#define LC_HAS_TEXT 0x01000000 | |||
#define LC_FAIL_ON_TEXT 0x02000000 | |||
#define LC_FAIL_NEXT 0x04000000 | |||
#define LC_FAIL_ON_LBRACE 0x08000000 | |||
#define LC_FAIL_ON_RBRACE 0x10000000 | |||
#define LC_FAIL_ON_EQUALS 0x20000000 | |||
/* Global contexts: */ | |||
@@ -163,9 +162,10 @@ static PyObject* TagCloseClose; | |||
/* Aggregate contexts: */ | |||
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) | |||
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | |||
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | |||
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) | |||
#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK) | |||
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) | |||
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) | |||
/* Tag contexts: */ | |||
@@ -467,7 +467,7 @@ class Tokenizer(object): | |||
reset = self._head | |||
self._head += 1 | |||
try: | |||
bad_context = self._context & contexts.INVALID_LINK | |||
bad_context = self._context & contexts.NO_EXT_LINKS | |||
if bad_context or not self._can_recurse(): | |||
raise BadRoute() | |||
link, extra, delta = self._really_parse_external_link(brackets) | |||
@@ -990,10 +990,8 @@ class Tokenizer(object): | |||
context = self._context | |||
if context & contexts.FAIL_NEXT: | |||
return False | |||
if context & contexts.WIKILINK: | |||
if context & contexts.WIKILINK_TEXT: | |||
return not (this == self._read(1) == "[") | |||
elif this == "]" or this == "{": | |||
if context & contexts.WIKILINK_TITLE: | |||
if this == "]" or this == "{": | |||
self._context |= contexts.FAIL_NEXT | |||
elif this == "\n" or this == "[" or this == "}": | |||
return False | |||
@@ -1083,7 +1081,7 @@ class Tokenizer(object): | |||
else: | |||
self._emit_text("}") | |||
elif this == next == "[" and self._can_recurse(): | |||
if not self._context & contexts.INVALID_LINK: | |||
if not self._context & contexts.NO_WIKILINKS: | |||
self._parse_wikilink() | |||
else: | |||
self._emit_text("[") | |||
@@ -150,3 +150,31 @@ name: comment_inside_bracketed_link | |||
label: an HTML comment inside a bracketed external link | |||
input: "[http://example.com/foo<!--comment-->bar]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()] | |||
--- | |||
name: wikilink_inside_external_link | |||
label: a wikilink inside an external link, which the parser considers valid (see issue #61) | |||
input: "[http://example.com/foo Foo [[Bar]]]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), ExternalLinkSeparator(), Text(text="Foo "), WikilinkOpen(), Text(text="Bar"), WikilinkClose(), ExternalLinkClose()] | |||
--- | |||
name: external_link_inside_wikilink | |||
label: an external link inside a wikilink, valid in the case of images (see issue #62) | |||
input: "[[File:Example.png|thumb|http://example.com]]" | |||
output: [WikilinkOpen(), Text(text="File:Example.png"), WikilinkSeparator(), Text(text="thumb|"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), WikilinkClose()] | |||
--- | |||
name: external_link_inside_wikilink_brackets | |||
label: an external link with brackets inside a wikilink | |||
input: "[[File:Example.png|thumb|[http://example.com Example]]]" | |||
output: [WikilinkOpen(), Text(text="File:Example.png"), WikilinkSeparator(), Text(text="thumb|"), ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose(), WikilinkClose()] | |||
--- | |||
name: external_link_inside_wikilink_title | |||
label: an external link inside a wikilink title, which is invalid | |||
input: "[[File:Example.png http://example.com]]" | |||
output: [WikilinkOpen(), Text(text="File:Example.png http://example.com"), WikilinkClose()] |
@@ -54,6 +54,20 @@ output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar[b | |||
--- | |||
name: nested | |||
label: a wikilink nested within another | |||
input: "[[foo|[[bar]]]]" | |||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), WikilinkOpen(), Text(text="bar"), WikilinkClose(), WikilinkClose()] | |||
--- | |||
name: nested_padding | |||
label: a wikilink nested within another, separated by other data | |||
input: "[[foo|a[[b]]c]]" | |||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c"), WikilinkClose()] | |||
--- | |||
name: invalid_newline | |||
label: invalid wikilink: newline as only content | |||
input: "[[\n]]" | |||
@@ -103,27 +117,13 @@ output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), | |||
--- | |||
name: invalid_nested_text | |||
label: invalid wikilink: a wikilink nested within the value of another | |||
name: invalid_nested_no_close | |||
label: invalid wikilink: a wikilink nested within the value of another, missing a pair of closing brackets | |||
input: "[[foo|[[bar]]" | |||
output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()] | |||
--- | |||
name: invalid_nested_text_2 | |||
label: invalid wikilink: a wikilink nested within the value of another, two pairs of closing brackets | |||
input: "[[foo|[[bar]]]]" | |||
output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")] | |||
name: invalid_nested_text_padding | |||
label: invalid wikilink: a wikilink nested within the value of another, separated by other data | |||
input: "[[foo|a[[b]]c]]" | |||
output: [Text(text="[[foo|a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c]]")] | |||
name: incomplete_open_only | |||
label: incomplete wikilinks: just an open | |||
input: "[[" | |||