Browse Source

Fix a bug involving nested links (closes #61 and #62).

tags/v0.3.3
Ben Kurtovic 11 years ago
parent
commit
5c5fd6b3cb
8 changed files with 117 additions and 89 deletions
  1. +1
    -0
      CHANGELOG
  2. +1
    -0
      docs/changelog.rst
  3. +29
    -29
      mwparserfromhell/parser/contexts.py
  4. +4
    -6
      mwparserfromhell/parser/tokenizer.c
  5. +34
    -34
      mwparserfromhell/parser/tokenizer.h
  6. +4
    -6
      mwparserfromhell/parser/tokenizer.py
  7. +28
    -0
      tests/tokenizer/integration.mwtest
  8. +16
    -14
      tests/tokenizer/wikilinks.mwtest

+ 1
- 0
CHANGELOG View File

@@ -13,6 +13,7 @@ v0.4 (unreleased):
- Given the frequency of issues with the (admittedly insufficient) tag parser,
there's a temporary skip_style_tags argument to parse() that ignores '' and
''' until these issues are corrected.
- Fixed a parser bug involving nested wikilinks and external links.
- C code cleanup and speed improvements.

v0.3.2 (released September 1, 2013):


+ 1
- 0
docs/changelog.rst View File

@@ -24,6 +24,7 @@ Unreleased
there's a temporary *skip_style_tags* argument to
:py:meth:`~mwparserfromhell.parse` that ignores ``''`` and ``'''`` until
these issues are corrected.
- Fixed a parser bug involving nested wikilinks and external links.
- C code cleanup and speed improvements.

v0.3.2


+ 29
- 29
mwparserfromhell/parser/contexts.py View File

@@ -55,7 +55,6 @@ Local (stack-specific) contexts:

* :py:const:`EXT_LINK_URI`
* :py:const:`EXT_LINK_TITLE`
* :py:const:`EXT_LINK_BRACKETS`

* :py:const:`HEADING`

@@ -100,7 +99,8 @@ Aggregate contexts:
* :py:const:`FAIL`
* :py:const:`UNSAFE`
* :py:const:`DOUBLE`
* :py:const:`INVALID_LINK`
* :py:const:`NO_WIKILINKS`
* :py:const:`NO_EXT_LINKS`

"""

@@ -121,38 +121,37 @@ WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT

EXT_LINK_URI = 1 << 7
EXT_LINK_TITLE = 1 << 8
EXT_LINK_BRACKETS = 1 << 9
EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + EXT_LINK_BRACKETS

HEADING_LEVEL_1 = 1 << 10
HEADING_LEVEL_2 = 1 << 11
HEADING_LEVEL_3 = 1 << 12
HEADING_LEVEL_4 = 1 << 13
HEADING_LEVEL_5 = 1 << 14
HEADING_LEVEL_6 = 1 << 15
EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE

HEADING_LEVEL_1 = 1 << 9
HEADING_LEVEL_2 = 1 << 10
HEADING_LEVEL_3 = 1 << 11
HEADING_LEVEL_4 = 1 << 12
HEADING_LEVEL_5 = 1 << 13
HEADING_LEVEL_6 = 1 << 14
HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 +
HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6)

TAG_OPEN = 1 << 16
TAG_ATTR = 1 << 17
TAG_BODY = 1 << 18
TAG_CLOSE = 1 << 19
TAG_OPEN = 1 << 15
TAG_ATTR = 1 << 16
TAG_BODY = 1 << 17
TAG_CLOSE = 1 << 18
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE

STYLE_ITALICS = 1 << 20
STYLE_BOLD = 1 << 21
STYLE_PASS_AGAIN = 1 << 22
STYLE_SECOND_PASS = 1 << 23
STYLE_ITALICS = 1 << 19
STYLE_BOLD = 1 << 20
STYLE_PASS_AGAIN = 1 << 21
STYLE_SECOND_PASS = 1 << 22
STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS

DL_TERM = 1 << 24
DL_TERM = 1 << 23

HAS_TEXT = 1 << 25
FAIL_ON_TEXT = 1 << 26
FAIL_NEXT = 1 << 27
FAIL_ON_LBRACE = 1 << 28
FAIL_ON_RBRACE = 1 << 29
FAIL_ON_EQUALS = 1 << 30
HAS_TEXT = 1 << 24
FAIL_ON_TEXT = 1 << 25
FAIL_NEXT = 1 << 26
FAIL_ON_LBRACE = 1 << 27
FAIL_ON_RBRACE = 1 << 28
FAIL_ON_EQUALS = 1 << 29
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS)

@@ -163,7 +162,8 @@ GL_HEADING = 1 << 0
# Aggregate contexts:

FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE
UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY +
ARGUMENT_NAME + TAG_CLOSE)
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK + EXT_LINK
NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI
NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK

+ 4
- 6
mwparserfromhell/parser/tokenizer.c View File

@@ -1158,7 +1158,7 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
*/
static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
{
#define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK
#define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS
#define NOT_A_LINK \
if (!brackets && self->topstack->context & LC_DLTERM) \
return Tokenizer_handle_dl_term(self); \
@@ -2440,10 +2440,8 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
{
if (context & LC_FAIL_NEXT)
return -1;
if (context & LC_WIKILINK) {
if (context & LC_WIKILINK_TEXT)
return (data == '[' && Tokenizer_READ(self, 1) == '[') ? -1 : 0;
else if (data == ']' || data == '{')
if (context & LC_WIKILINK_TITLE) {
if (data == ']' || data == '{')
self->topstack->context |= LC_FAIL_NEXT;
else if (data == '\n' || data == '[' || data == '}')
return -1;
@@ -2577,7 +2575,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
return NULL;
}
else if (this == next && next == '[' && Tokenizer_CAN_RECURSE(self)) {
if (!(this_context & AGG_INVALID_LINK)) {
if (!(this_context & AGG_NO_WIKILINKS)) {
if (Tokenizer_parse_wikilink(self))
return NULL;
}


+ 34
- 34
mwparserfromhell/parser/tokenizer.h View File

@@ -121,40 +121,39 @@ static PyObject* TagCloseClose;
#define LC_WIKILINK_TITLE 0x00000020
#define LC_WIKILINK_TEXT 0x00000040

#define LC_EXT_LINK 0x00000380
#define LC_EXT_LINK 0x00000180
#define LC_EXT_LINK_URI 0x00000080
#define LC_EXT_LINK_TITLE 0x00000100
#define LC_EXT_LINK_BRACKETS 0x00000200

#define LC_HEADING 0x0000FC00
#define LC_HEADING_LEVEL_1 0x00000400
#define LC_HEADING_LEVEL_2 0x00000800
#define LC_HEADING_LEVEL_3 0x00001000
#define LC_HEADING_LEVEL_4 0x00002000
#define LC_HEADING_LEVEL_5 0x00004000
#define LC_HEADING_LEVEL_6 0x00008000

#define LC_TAG 0x000F0000
#define LC_TAG_OPEN 0x00010000
#define LC_TAG_ATTR 0x00020000
#define LC_TAG_BODY 0x00040000
#define LC_TAG_CLOSE 0x00080000

#define LC_STYLE 0x00F00000
#define LC_STYLE_ITALICS 0x00100000
#define LC_STYLE_BOLD 0x00200000
#define LC_STYLE_PASS_AGAIN 0x00400000
#define LC_STYLE_SECOND_PASS 0x00800000

#define LC_DLTERM 0x01000000

#define LC_SAFETY_CHECK 0x7E000000
#define LC_HAS_TEXT 0x02000000
#define LC_FAIL_ON_TEXT 0x04000000
#define LC_FAIL_NEXT 0x08000000
#define LC_FAIL_ON_LBRACE 0x10000000
#define LC_FAIL_ON_RBRACE 0x20000000
#define LC_FAIL_ON_EQUALS 0x40000000

#define LC_HEADING 0x00007E00
#define LC_HEADING_LEVEL_1 0x00000200
#define LC_HEADING_LEVEL_2 0x00000400
#define LC_HEADING_LEVEL_3 0x00000800
#define LC_HEADING_LEVEL_4 0x00001000
#define LC_HEADING_LEVEL_5 0x00002000
#define LC_HEADING_LEVEL_6 0x00004000

#define LC_TAG 0x00078000
#define LC_TAG_OPEN 0x00008000
#define LC_TAG_ATTR 0x00010000
#define LC_TAG_BODY 0x00020000
#define LC_TAG_CLOSE 0x00040000

#define LC_STYLE 0x00780000
#define LC_STYLE_ITALICS 0x00080000
#define LC_STYLE_BOLD 0x00100000
#define LC_STYLE_PASS_AGAIN 0x00200000
#define LC_STYLE_SECOND_PASS 0x00400000

#define LC_DLTERM 0x00800000

#define LC_SAFETY_CHECK 0x3F000000
#define LC_HAS_TEXT 0x01000000
#define LC_FAIL_ON_TEXT 0x02000000
#define LC_FAIL_NEXT 0x04000000
#define LC_FAIL_ON_LBRACE 0x08000000
#define LC_FAIL_ON_RBRACE 0x10000000
#define LC_FAIL_ON_EQUALS 0x20000000

/* Global contexts: */

@@ -163,9 +162,10 @@ static PyObject* TagCloseClose;
/* Aggregate contexts: */

#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK)
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)

/* Tag contexts: */



+ 4
- 6
mwparserfromhell/parser/tokenizer.py View File

@@ -467,7 +467,7 @@ class Tokenizer(object):
reset = self._head
self._head += 1
try:
bad_context = self._context & contexts.INVALID_LINK
bad_context = self._context & contexts.NO_EXT_LINKS
if bad_context or not self._can_recurse():
raise BadRoute()
link, extra, delta = self._really_parse_external_link(brackets)
@@ -990,10 +990,8 @@ class Tokenizer(object):
context = self._context
if context & contexts.FAIL_NEXT:
return False
if context & contexts.WIKILINK:
if context & contexts.WIKILINK_TEXT:
return not (this == self._read(1) == "[")
elif this == "]" or this == "{":
if context & contexts.WIKILINK_TITLE:
if this == "]" or this == "{":
self._context |= contexts.FAIL_NEXT
elif this == "\n" or this == "[" or this == "}":
return False
@@ -1083,7 +1081,7 @@ class Tokenizer(object):
else:
self._emit_text("}")
elif this == next == "[" and self._can_recurse():
if not self._context & contexts.INVALID_LINK:
if not self._context & contexts.NO_WIKILINKS:
self._parse_wikilink()
else:
self._emit_text("[")


+ 28
- 0
tests/tokenizer/integration.mwtest View File

@@ -150,3 +150,31 @@ name: comment_inside_bracketed_link
label: an HTML comment inside a bracketed external link
input: "[http://example.com/foo<!--comment-->bar]"
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()]

---

name: wikilink_inside_external_link
label: a wikilink inside an external link, which the parser considers valid (see issue #61)
input: "[http://example.com/foo Foo [[Bar]]]"
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), ExternalLinkSeparator(), Text(text="Foo "), WikilinkOpen(), Text(text="Bar"), WikilinkClose(), ExternalLinkClose()]

---

name: external_link_inside_wikilink
label: an external link inside a wikilink, valid in the case of images (see issue #62)
input: "[[File:Example.png|thumb|http://example.com]]"
output: [WikilinkOpen(), Text(text="File:Example.png"), WikilinkSeparator(), Text(text="thumb|"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), WikilinkClose()]

---

name: external_link_inside_wikilink_brackets
label: an external link with brackets inside a wikilink
input: "[[File:Example.png|thumb|[http://example.com Example]]]"
output: [WikilinkOpen(), Text(text="File:Example.png"), WikilinkSeparator(), Text(text="thumb|"), ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose(), WikilinkClose()]

---

name: external_link_inside_wikilink_title
label: an external link inside a wikilink title, which is invalid
input: "[[File:Example.png http://example.com]]"
output: [WikilinkOpen(), Text(text="File:Example.png http://example.com"), WikilinkClose()]

+ 16
- 14
tests/tokenizer/wikilinks.mwtest View File

@@ -54,6 +54,20 @@ output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar[b

---

name: nested
label: a wikilink nested within another
input: "[[foo|[[bar]]]]"
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), WikilinkOpen(), Text(text="bar"), WikilinkClose(), WikilinkClose()]

---

name: nested_padding
label: a wikilink nested within another, separated by other data
input: "[[foo|a[[b]]c]]"
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c"), WikilinkClose()]

---

name: invalid_newline
label: invalid wikilink: newline as only content
input: "[[\n]]"
@@ -103,27 +117,13 @@ output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(),

---

name: invalid_nested_text
label: invalid wikilink: a wikilink nested within the value of another
name: invalid_nested_no_close
label: invalid wikilink: a wikilink nested within the value of another, missing a pair of closing brackets
input: "[[foo|[[bar]]"
output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()]

---

name: invalid_nested_text_2
label: invalid wikilink: a wikilink nested within the value of another, two pairs of closing brackets
input: "[[foo|[[bar]]]]"
output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")]


name: invalid_nested_text_padding
label: invalid wikilink: a wikilink nested within the value of another, separated by other data
input: "[[foo|a[[b]]c]]"
output: [Text(text="[[foo|a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c]]")]


name: incomplete_open_only
label: incomplete wikilinks: just an open
input: "[["


Loading…
Cancel
Save