@@ -13,6 +13,7 @@ v0.4 (unreleased): | |||||
- Given the frequency of issues with the (admittedly insufficient) tag parser, | - Given the frequency of issues with the (admittedly insufficient) tag parser, | ||||
there's a temporary skip_style_tags argument to parse() that ignores '' and | there's a temporary skip_style_tags argument to parse() that ignores '' and | ||||
''' until these issues are corrected. | ''' until these issues are corrected. | ||||
- Fixed a parser bug involving nested wikilinks and external links. | |||||
- C code cleanup and speed improvements. | - C code cleanup and speed improvements. | ||||
v0.3.2 (released September 1, 2013): | v0.3.2 (released September 1, 2013): | ||||
@@ -24,6 +24,7 @@ Unreleased | |||||
there's a temporary *skip_style_tags* argument to | there's a temporary *skip_style_tags* argument to | ||||
:py:meth:`~mwparserfromhell.parse` that ignores ``''`` and ``'''`` until | :py:meth:`~mwparserfromhell.parse` that ignores ``''`` and ``'''`` until | ||||
these issues are corrected. | these issues are corrected. | ||||
- Fixed a parser bug involving nested wikilinks and external links. | |||||
- C code cleanup and speed improvements. | - C code cleanup and speed improvements. | ||||
v0.3.2 | v0.3.2 | ||||
@@ -55,7 +55,6 @@ Local (stack-specific) contexts: | |||||
* :py:const:`EXT_LINK_URI` | * :py:const:`EXT_LINK_URI` | ||||
* :py:const:`EXT_LINK_TITLE` | * :py:const:`EXT_LINK_TITLE` | ||||
* :py:const:`EXT_LINK_BRACKETS` | |||||
* :py:const:`HEADING` | * :py:const:`HEADING` | ||||
@@ -100,7 +99,8 @@ Aggregate contexts: | |||||
* :py:const:`FAIL` | * :py:const:`FAIL` | ||||
* :py:const:`UNSAFE` | * :py:const:`UNSAFE` | ||||
* :py:const:`DOUBLE` | * :py:const:`DOUBLE` | ||||
* :py:const:`INVALID_LINK` | |||||
* :py:const:`NO_WIKILINKS` | |||||
* :py:const:`NO_EXT_LINKS` | |||||
""" | """ | ||||
@@ -121,38 +121,37 @@ WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT | |||||
EXT_LINK_URI = 1 << 7 | EXT_LINK_URI = 1 << 7 | ||||
EXT_LINK_TITLE = 1 << 8 | EXT_LINK_TITLE = 1 << 8 | ||||
EXT_LINK_BRACKETS = 1 << 9 | |||||
EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + EXT_LINK_BRACKETS | |||||
HEADING_LEVEL_1 = 1 << 10 | |||||
HEADING_LEVEL_2 = 1 << 11 | |||||
HEADING_LEVEL_3 = 1 << 12 | |||||
HEADING_LEVEL_4 = 1 << 13 | |||||
HEADING_LEVEL_5 = 1 << 14 | |||||
HEADING_LEVEL_6 = 1 << 15 | |||||
EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE | |||||
HEADING_LEVEL_1 = 1 << 9 | |||||
HEADING_LEVEL_2 = 1 << 10 | |||||
HEADING_LEVEL_3 = 1 << 11 | |||||
HEADING_LEVEL_4 = 1 << 12 | |||||
HEADING_LEVEL_5 = 1 << 13 | |||||
HEADING_LEVEL_6 = 1 << 14 | |||||
HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 + | HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 + | ||||
HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6) | HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6) | ||||
TAG_OPEN = 1 << 16 | |||||
TAG_ATTR = 1 << 17 | |||||
TAG_BODY = 1 << 18 | |||||
TAG_CLOSE = 1 << 19 | |||||
TAG_OPEN = 1 << 15 | |||||
TAG_ATTR = 1 << 16 | |||||
TAG_BODY = 1 << 17 | |||||
TAG_CLOSE = 1 << 18 | |||||
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE | TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE | ||||
STYLE_ITALICS = 1 << 20 | |||||
STYLE_BOLD = 1 << 21 | |||||
STYLE_PASS_AGAIN = 1 << 22 | |||||
STYLE_SECOND_PASS = 1 << 23 | |||||
STYLE_ITALICS = 1 << 19 | |||||
STYLE_BOLD = 1 << 20 | |||||
STYLE_PASS_AGAIN = 1 << 21 | |||||
STYLE_SECOND_PASS = 1 << 22 | |||||
STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS | STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS | ||||
DL_TERM = 1 << 24 | |||||
DL_TERM = 1 << 23 | |||||
HAS_TEXT = 1 << 25 | |||||
FAIL_ON_TEXT = 1 << 26 | |||||
FAIL_NEXT = 1 << 27 | |||||
FAIL_ON_LBRACE = 1 << 28 | |||||
FAIL_ON_RBRACE = 1 << 29 | |||||
FAIL_ON_EQUALS = 1 << 30 | |||||
HAS_TEXT = 1 << 24 | |||||
FAIL_ON_TEXT = 1 << 25 | |||||
FAIL_NEXT = 1 << 26 | |||||
FAIL_ON_LBRACE = 1 << 27 | |||||
FAIL_ON_RBRACE = 1 << 28 | |||||
FAIL_ON_EQUALS = 1 << 29 | |||||
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | ||||
FAIL_ON_RBRACE + FAIL_ON_EQUALS) | FAIL_ON_RBRACE + FAIL_ON_EQUALS) | ||||
@@ -163,7 +162,8 @@ GL_HEADING = 1 << 0 | |||||
# Aggregate contexts: | # Aggregate contexts: | ||||
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE | FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE | ||||
UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + | |||||
ARGUMENT_NAME + TAG_CLOSE) | |||||
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + | |||||
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) | |||||
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE | DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE | ||||
INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK + EXT_LINK | |||||
NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI | |||||
NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK |
@@ -1158,7 +1158,7 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) | |||||
*/ | */ | ||||
static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) | static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) | ||||
{ | { | ||||
#define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK | |||||
#define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS | |||||
#define NOT_A_LINK \ | #define NOT_A_LINK \ | ||||
if (!brackets && self->topstack->context & LC_DLTERM) \ | if (!brackets && self->topstack->context & LC_DLTERM) \ | ||||
return Tokenizer_handle_dl_term(self); \ | return Tokenizer_handle_dl_term(self); \ | ||||
@@ -2440,10 +2440,8 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | |||||
{ | { | ||||
if (context & LC_FAIL_NEXT) | if (context & LC_FAIL_NEXT) | ||||
return -1; | return -1; | ||||
if (context & LC_WIKILINK) { | |||||
if (context & LC_WIKILINK_TEXT) | |||||
return (data == '[' && Tokenizer_READ(self, 1) == '[') ? -1 : 0; | |||||
else if (data == ']' || data == '{') | |||||
if (context & LC_WIKILINK_TITLE) { | |||||
if (data == ']' || data == '{') | |||||
self->topstack->context |= LC_FAIL_NEXT; | self->topstack->context |= LC_FAIL_NEXT; | ||||
else if (data == '\n' || data == '[' || data == '}') | else if (data == '\n' || data == '[' || data == '}') | ||||
return -1; | return -1; | ||||
@@ -2577,7 +2575,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||||
return NULL; | return NULL; | ||||
} | } | ||||
else if (this == next && next == '[' && Tokenizer_CAN_RECURSE(self)) { | else if (this == next && next == '[' && Tokenizer_CAN_RECURSE(self)) { | ||||
if (!(this_context & AGG_INVALID_LINK)) { | |||||
if (!(this_context & AGG_NO_WIKILINKS)) { | |||||
if (Tokenizer_parse_wikilink(self)) | if (Tokenizer_parse_wikilink(self)) | ||||
return NULL; | return NULL; | ||||
} | } | ||||
@@ -121,40 +121,39 @@ static PyObject* TagCloseClose; | |||||
#define LC_WIKILINK_TITLE 0x00000020 | #define LC_WIKILINK_TITLE 0x00000020 | ||||
#define LC_WIKILINK_TEXT 0x00000040 | #define LC_WIKILINK_TEXT 0x00000040 | ||||
#define LC_EXT_LINK 0x00000380 | |||||
#define LC_EXT_LINK 0x00000180 | |||||
#define LC_EXT_LINK_URI 0x00000080 | #define LC_EXT_LINK_URI 0x00000080 | ||||
#define LC_EXT_LINK_TITLE 0x00000100 | #define LC_EXT_LINK_TITLE 0x00000100 | ||||
#define LC_EXT_LINK_BRACKETS 0x00000200 | |||||
#define LC_HEADING 0x0000FC00 | |||||
#define LC_HEADING_LEVEL_1 0x00000400 | |||||
#define LC_HEADING_LEVEL_2 0x00000800 | |||||
#define LC_HEADING_LEVEL_3 0x00001000 | |||||
#define LC_HEADING_LEVEL_4 0x00002000 | |||||
#define LC_HEADING_LEVEL_5 0x00004000 | |||||
#define LC_HEADING_LEVEL_6 0x00008000 | |||||
#define LC_TAG 0x000F0000 | |||||
#define LC_TAG_OPEN 0x00010000 | |||||
#define LC_TAG_ATTR 0x00020000 | |||||
#define LC_TAG_BODY 0x00040000 | |||||
#define LC_TAG_CLOSE 0x00080000 | |||||
#define LC_STYLE 0x00F00000 | |||||
#define LC_STYLE_ITALICS 0x00100000 | |||||
#define LC_STYLE_BOLD 0x00200000 | |||||
#define LC_STYLE_PASS_AGAIN 0x00400000 | |||||
#define LC_STYLE_SECOND_PASS 0x00800000 | |||||
#define LC_DLTERM 0x01000000 | |||||
#define LC_SAFETY_CHECK 0x7E000000 | |||||
#define LC_HAS_TEXT 0x02000000 | |||||
#define LC_FAIL_ON_TEXT 0x04000000 | |||||
#define LC_FAIL_NEXT 0x08000000 | |||||
#define LC_FAIL_ON_LBRACE 0x10000000 | |||||
#define LC_FAIL_ON_RBRACE 0x20000000 | |||||
#define LC_FAIL_ON_EQUALS 0x40000000 | |||||
#define LC_HEADING 0x00007E00 | |||||
#define LC_HEADING_LEVEL_1 0x00000200 | |||||
#define LC_HEADING_LEVEL_2 0x00000400 | |||||
#define LC_HEADING_LEVEL_3 0x00000800 | |||||
#define LC_HEADING_LEVEL_4 0x00001000 | |||||
#define LC_HEADING_LEVEL_5 0x00002000 | |||||
#define LC_HEADING_LEVEL_6 0x00004000 | |||||
#define LC_TAG 0x00078000 | |||||
#define LC_TAG_OPEN 0x00008000 | |||||
#define LC_TAG_ATTR 0x00010000 | |||||
#define LC_TAG_BODY 0x00020000 | |||||
#define LC_TAG_CLOSE 0x00040000 | |||||
#define LC_STYLE 0x00780000 | |||||
#define LC_STYLE_ITALICS 0x00080000 | |||||
#define LC_STYLE_BOLD 0x00100000 | |||||
#define LC_STYLE_PASS_AGAIN 0x00200000 | |||||
#define LC_STYLE_SECOND_PASS 0x00400000 | |||||
#define LC_DLTERM 0x00800000 | |||||
#define LC_SAFETY_CHECK 0x3F000000 | |||||
#define LC_HAS_TEXT 0x01000000 | |||||
#define LC_FAIL_ON_TEXT 0x02000000 | |||||
#define LC_FAIL_NEXT 0x04000000 | |||||
#define LC_FAIL_ON_LBRACE 0x08000000 | |||||
#define LC_FAIL_ON_RBRACE 0x10000000 | |||||
#define LC_FAIL_ON_EQUALS 0x20000000 | |||||
/* Global contexts: */ | /* Global contexts: */ | ||||
@@ -163,9 +162,10 @@ static PyObject* TagCloseClose; | |||||
/* Aggregate contexts: */ | /* Aggregate contexts: */ | ||||
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) | #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) | ||||
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | |||||
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | |||||
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) | #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) | ||||
#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK) | |||||
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) | |||||
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) | |||||
/* Tag contexts: */ | /* Tag contexts: */ | ||||
@@ -467,7 +467,7 @@ class Tokenizer(object): | |||||
reset = self._head | reset = self._head | ||||
self._head += 1 | self._head += 1 | ||||
try: | try: | ||||
bad_context = self._context & contexts.INVALID_LINK | |||||
bad_context = self._context & contexts.NO_EXT_LINKS | |||||
if bad_context or not self._can_recurse(): | if bad_context or not self._can_recurse(): | ||||
raise BadRoute() | raise BadRoute() | ||||
link, extra, delta = self._really_parse_external_link(brackets) | link, extra, delta = self._really_parse_external_link(brackets) | ||||
@@ -990,10 +990,8 @@ class Tokenizer(object): | |||||
context = self._context | context = self._context | ||||
if context & contexts.FAIL_NEXT: | if context & contexts.FAIL_NEXT: | ||||
return False | return False | ||||
if context & contexts.WIKILINK: | |||||
if context & contexts.WIKILINK_TEXT: | |||||
return not (this == self._read(1) == "[") | |||||
elif this == "]" or this == "{": | |||||
if context & contexts.WIKILINK_TITLE: | |||||
if this == "]" or this == "{": | |||||
self._context |= contexts.FAIL_NEXT | self._context |= contexts.FAIL_NEXT | ||||
elif this == "\n" or this == "[" or this == "}": | elif this == "\n" or this == "[" or this == "}": | ||||
return False | return False | ||||
@@ -1083,7 +1081,7 @@ class Tokenizer(object): | |||||
else: | else: | ||||
self._emit_text("}") | self._emit_text("}") | ||||
elif this == next == "[" and self._can_recurse(): | elif this == next == "[" and self._can_recurse(): | ||||
if not self._context & contexts.INVALID_LINK: | |||||
if not self._context & contexts.NO_WIKILINKS: | |||||
self._parse_wikilink() | self._parse_wikilink() | ||||
else: | else: | ||||
self._emit_text("[") | self._emit_text("[") | ||||
@@ -150,3 +150,31 @@ name: comment_inside_bracketed_link | |||||
label: an HTML comment inside a bracketed external link | label: an HTML comment inside a bracketed external link | ||||
input: "[http://example.com/foo<!--comment-->bar]" | input: "[http://example.com/foo<!--comment-->bar]" | ||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()] | output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()] | ||||
--- | |||||
name: wikilink_inside_external_link | |||||
label: a wikilink inside an external link, which the parser considers valid (see issue #61) | |||||
input: "[http://example.com/foo Foo [[Bar]]]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), ExternalLinkSeparator(), Text(text="Foo "), WikilinkOpen(), Text(text="Bar"), WikilinkClose(), ExternalLinkClose()] | |||||
--- | |||||
name: external_link_inside_wikilink | |||||
label: an external link inside a wikilink, valid in the case of images (see issue #62) | |||||
input: "[[File:Example.png|thumb|http://example.com]]" | |||||
output: [WikilinkOpen(), Text(text="File:Example.png"), WikilinkSeparator(), Text(text="thumb|"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), WikilinkClose()] | |||||
--- | |||||
name: external_link_inside_wikilink_brackets | |||||
label: an external link with brackets inside a wikilink | |||||
input: "[[File:Example.png|thumb|[http://example.com Example]]]" | |||||
output: [WikilinkOpen(), Text(text="File:Example.png"), WikilinkSeparator(), Text(text="thumb|"), ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose(), WikilinkClose()] | |||||
--- | |||||
name: external_link_inside_wikilink_title | |||||
label: an external link inside a wikilink title, which is invalid | |||||
input: "[[File:Example.png http://example.com]]" | |||||
output: [WikilinkOpen(), Text(text="File:Example.png http://example.com"), WikilinkClose()] |
@@ -54,6 +54,20 @@ output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar[b | |||||
--- | --- | ||||
name: nested | |||||
label: a wikilink nested within another | |||||
input: "[[foo|[[bar]]]]" | |||||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), WikilinkOpen(), Text(text="bar"), WikilinkClose(), WikilinkClose()] | |||||
--- | |||||
name: nested_padding | |||||
label: a wikilink nested within another, separated by other data | |||||
input: "[[foo|a[[b]]c]]" | |||||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c"), WikilinkClose()] | |||||
--- | |||||
name: invalid_newline | name: invalid_newline | ||||
label: invalid wikilink: newline as only content | label: invalid wikilink: newline as only content | ||||
input: "[[\n]]" | input: "[[\n]]" | ||||
@@ -103,27 +117,13 @@ output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), | |||||
--- | --- | ||||
name: invalid_nested_text | |||||
label: invalid wikilink: a wikilink nested within the value of another | |||||
name: invalid_nested_no_close | |||||
label: invalid wikilink: a wikilink nested within the value of another, missing a pair of closing brackets | |||||
input: "[[foo|[[bar]]" | input: "[[foo|[[bar]]" | ||||
output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()] | output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()] | ||||
--- | --- | ||||
name: invalid_nested_text_2 | |||||
label: invalid wikilink: a wikilink nested within the value of another, two pairs of closing brackets | |||||
input: "[[foo|[[bar]]]]" | |||||
output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")] | |||||
name: invalid_nested_text_padding | |||||
label: invalid wikilink: a wikilink nested within the value of another, separated by other data | |||||
input: "[[foo|a[[b]]c]]" | |||||
output: [Text(text="[[foo|a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c]]")] | |||||
name: incomplete_open_only | name: incomplete_open_only | ||||
label: incomplete wikilinks: just an open | label: incomplete wikilinks: just an open | ||||
input: "[[" | input: "[[" | ||||