diff --git a/CHANGELOG b/CHANGELOG
index d95b3a3..462d2dc 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,6 @@
 v0.5 (unreleased):
 
--
+- Fixed edge cases involving wikilinks inside of external links and vice versa.
 
 v0.4.2 (released July 30, 2015):
 
diff --git a/docs/changelog.rst b/docs/changelog.rst
index acb09fc..7ca9f29 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -7,7 +7,7 @@ v0.5
 Unreleased
 (`changes `__):
 
--
+- Fixed edge cases involving wikilinks inside of external links and vice versa.
 
 v0.4.2
 ------
diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c
index 23cc246..60eef6e 100644
--- a/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -47,6 +47,8 @@ typedef struct {
 
 /* Forward declarations */
 
+static PyObject* Tokenizer_really_parse_external_link(
+    Tokenizer*, int, Textbuffer*);
 static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
 static int Tokenizer_handle_dl_term(Tokenizer*);
@@ -362,30 +364,70 @@ static PyObject* Tokenizer_handle_argument_end(Tokenizer* self)
 static int Tokenizer_parse_wikilink(Tokenizer* self)
 {
     Py_ssize_t reset;
-    PyObject *wikilink;
+    PyObject *extlink, *wikilink, *kwargs;
 
+    reset = self->head + 1;
     self->head += 2;
-    reset = self->head - 1;
-    wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE, 1);
+    // If the wikilink looks like an external link, parse it as such:
+    extlink = Tokenizer_really_parse_external_link(self, 1, NULL);
     if (BAD_ROUTE) {
         RESET_ROUTE();
+        self->head = reset + 1;
+        // Otherwise, actually parse it as a wikilink:
+        wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE, 1);
+        if (BAD_ROUTE) {
+            RESET_ROUTE();
+            self->head = reset;
+            if (Tokenizer_emit_text(self, "[["))
+                return -1;
+            return 0;
+        }
+        if (!wikilink)
+            return -1;
+        if (Tokenizer_emit(self, WikilinkOpen)) {
+            Py_DECREF(wikilink);
+            return -1;
+        }
+        if (Tokenizer_emit_all(self, wikilink)) {
+            Py_DECREF(wikilink);
+            return -1;
+        }
+        Py_DECREF(wikilink);
+        if (Tokenizer_emit(self, WikilinkClose))
+            return -1;
+        return 0;
+    }
+    if (!extlink)
+        return -1;
+    if (self->topstack->context & LC_EXT_LINK_TITLE) {
+        // In this exceptional case, an external link that looks like a
+        // wikilink inside of an external link is parsed as text:
+        Py_DECREF(extlink);
         self->head = reset;
         if (Tokenizer_emit_text(self, "[["))
             return -1;
         return 0;
     }
-    if (!wikilink)
+    if (Tokenizer_emit_text(self, "[")) {
+        Py_DECREF(extlink);
         return -1;
-    if (Tokenizer_emit(self, WikilinkOpen)) {
-        Py_DECREF(wikilink);
+    }
+    kwargs = PyDict_New();
+    if (!kwargs) {
+        Py_DECREF(extlink);
         return -1;
     }
-    if (Tokenizer_emit_all(self, wikilink)) {
-        Py_DECREF(wikilink);
+    PyDict_SetItemString(kwargs, "brackets", Py_True);
+    if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) {
+        Py_DECREF(extlink);
+        return -1;
+    }
+    if (Tokenizer_emit_all(self, extlink)) {
+        Py_DECREF(extlink);
         return -1;
     }
-    Py_DECREF(wikilink);
-    if (Tokenizer_emit(self, WikilinkClose))
+    Py_DECREF(extlink);
+    if (Tokenizer_emit(self, ExternalLinkClose))
         return -1;
     return 0;
 }
@@ -553,7 +595,7 @@ static int Tokenizer_handle_free_link_text(
     Tokenizer* self, int* parens, Textbuffer* tail, Unicode this)
 {
 #define PUSH_TAIL_BUFFER(tail, error) \
-    if (tail->length > 0) { \
+    if (tail && tail->length > 0) { \
         if (Textbuffer_concat(self->topstack->textbuffer, tail)) \
             return error; \
         if (Textbuffer_reset(tail)) \
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 5c89455..3a1c775 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -299,17 +299,34 @@ class Tokenizer(object):
 
     def _parse_wikilink(self):
         """Parse an internal wikilink at the head of the wikicode string."""
+        reset = self._head + 1
         self._head += 2
-        reset = self._head - 1
         try:
-            wikilink = self._parse(contexts.WIKILINK_TITLE)
+            # If the wikilink looks like an external link, parse it as such:
+            link, extra, delta = self._really_parse_external_link(True)
         except BadRoute:
-            self._head = reset
-            self._emit_text("[[")
+            self._head = reset + 1
+            try:
+                # Otherwise, actually parse it as a wikilink:
+                wikilink = self._parse(contexts.WIKILINK_TITLE)
+            except BadRoute:
+                self._head = reset
+                self._emit_text("[[")
+            else:
+                self._emit(tokens.WikilinkOpen())
+                self._emit_all(wikilink)
+                self._emit(tokens.WikilinkClose())
         else:
-            self._emit(tokens.WikilinkOpen())
-            self._emit_all(wikilink)
-            self._emit(tokens.WikilinkClose())
+            if self._context & contexts.EXT_LINK_TITLE:
+                # In this exceptional case, an external link that looks like a
+                # wikilink inside of an external link is parsed as text:
+                self._head = reset
+                self._emit_text("[[")
+                return
+            self._emit_text("[")
+            self._emit(tokens.ExternalLinkOpen(brackets=True))
+            self._emit_all(link)
+            self._emit(tokens.ExternalLinkClose())
 
     def _handle_wikilink_separator(self):
         """Handle the separator between a wikilink's title and its text."""
diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest
index 1abc74f..d2efdfc 100644
--- a/tests/tokenizer/external_links.mwtest
+++ b/tests/tokenizer/external_links.mwtest
@@ -82,6 +82,13 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), Exter
 
 ---
 
+name: brackets_recursive_2
+label: bracket-enclosed link with a double bracket-enclosed link as the title
+input: "[http://example.com [[http://example.com]]]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="[[http://example.com"), ExternalLinkClose(), Text(text="]]")]
+
+---
+
 name: period_after
 label: a period after a free link that is excluded
 input: "http://example.com."
diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest
index 4d6b940..5b8ff25 100644
--- a/tests/tokenizer/integration.mwtest
+++ b/tests/tokenizer/integration.mwtest
@@ -175,7 +175,7 @@ output: [WikilinkOpen(), Text(text="File:Example.png"), WikilinkSeparator(), Tex
 
 ---
 
 name: external_link_inside_wikilink_title
-label: an external link inside a wikilink title, which is invalid
+label: an external link inside a wikilink title, which is not parsed
 input: "[[File:Example.png http://example.com]]"
 output: [WikilinkOpen(), Text(text="File:Example.png http://example.com"), WikilinkClose()]
@@ -318,3 +318,17 @@ name: incomplete_comment_in_link_title_6
 label: incomplete comments are invalid in link titles
 input: "[[foo
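
A minimal usage sketch (not part of the patch) of the behaviour these changes aim for, written against mwparserfromhell's public parse() and filter API; the expected results below are inferred from the token lists in the test cases above rather than quoted from the test suite.

# Sketch only: assumes the patched tokenizer from this diff is installed.
import mwparserfromhell

# "[[http://example.com]]" has no valid wikilink target, so it should now fall
# back to a bracketed external link wrapped in stray "[" / "]" text:
code = mwparserfromhell.parse("[[http://example.com]]")
print(code.filter_wikilinks())       # expected: no wikilink nodes
print(code.filter_external_links())  # expected: one bracketed external link

# Inside a bracketed link's title, the same construct stays plain text
# (the "exceptional case" in _parse_wikilink; see brackets_recursive_2 above):
code = mwparserfromhell.parse("[http://example.com [[http://example.com]]]")
link = code.filter_external_links()[0]
print(link.title)                    # expected: [[http://example.com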