Browse Source

Edge cases involving wikilink -> external link fallback (fixes #120)

tags/v0.4.3
Ben Kurtovic 8 years ago
parent
commit
4f3ab48375
6 changed files with 101 additions and 21 deletions
  1. +1
    -1
      CHANGELOG
  2. +1
    -1
      docs/changelog.rst
  3. +53
    -11
      mwparserfromhell/parser/ctokenizer/tok_parse.c
  4. +24
    -7
      mwparserfromhell/parser/tokenizer.py
  5. +7
    -0
      tests/tokenizer/external_links.mwtest
  6. +15
    -1
      tests/tokenizer/integration.mwtest

+ 1
- 1
CHANGELOG View File

@@ -1,6 +1,6 @@
v0.5 (unreleased):

-
- Fixed edge cases involving wikilinks inside of external links and vice versa.

v0.4.2 (released July 30, 2015):



+ 1
- 1
docs/changelog.rst View File

@@ -7,7 +7,7 @@ v0.5
Unreleased
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.4.2...develop>`__):

-
- Fixed edge cases involving wikilinks inside of external links and vice versa.

v0.4.2
------


+ 53
- 11
mwparserfromhell/parser/ctokenizer/tok_parse.c View File

@@ -47,6 +47,8 @@ typedef struct {

/* Forward declarations */

static PyObject* Tokenizer_really_parse_external_link(
Tokenizer*, int, Textbuffer*);
static int Tokenizer_parse_entity(Tokenizer*);
static int Tokenizer_parse_comment(Tokenizer*);
static int Tokenizer_handle_dl_term(Tokenizer*);
@@ -362,30 +364,70 @@ static PyObject* Tokenizer_handle_argument_end(Tokenizer* self)
static int Tokenizer_parse_wikilink(Tokenizer* self)
{
Py_ssize_t reset;
PyObject *wikilink;
PyObject *extlink, *wikilink, *kwargs;

reset = self->head + 1;
self->head += 2;
reset = self->head - 1;
wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE, 1);
// If the wikilink looks like an external link, parse it as such:
extlink = Tokenizer_really_parse_external_link(self, 1, NULL);
if (BAD_ROUTE) {
RESET_ROUTE();
self->head = reset + 1;
// Otherwise, actually parse it as a wikilink:
wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE, 1);
if (BAD_ROUTE) {
RESET_ROUTE();
self->head = reset;
if (Tokenizer_emit_text(self, "[["))
return -1;
return 0;
}
if (!wikilink)
return -1;
if (Tokenizer_emit(self, WikilinkOpen)) {
Py_DECREF(wikilink);
return -1;
}
if (Tokenizer_emit_all(self, wikilink)) {
Py_DECREF(wikilink);
return -1;
}
Py_DECREF(wikilink);
if (Tokenizer_emit(self, WikilinkClose))
return -1;
return 0;
}
if (!extlink)
return -1;
if (self->topstack->context & LC_EXT_LINK_TITLE) {
// In this exceptional case, an external link that looks like a
// wikilink inside of an external link is parsed as text:
Py_DECREF(extlink);
self->head = reset;
if (Tokenizer_emit_text(self, "[["))
return -1;
return 0;
}
if (!wikilink)
if (Tokenizer_emit_text(self, "[")) {
Py_DECREF(extlink);
return -1;
if (Tokenizer_emit(self, WikilinkOpen)) {
Py_DECREF(wikilink);
}
kwargs = PyDict_New();
if (!kwargs) {
Py_DECREF(extlink);
return -1;
}
if (Tokenizer_emit_all(self, wikilink)) {
Py_DECREF(wikilink);
PyDict_SetItemString(kwargs, "brackets", Py_True);
if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) {
Py_DECREF(extlink);
return -1;
}
if (Tokenizer_emit_all(self, extlink)) {
Py_DECREF(extlink);
return -1;
}
Py_DECREF(wikilink);
if (Tokenizer_emit(self, WikilinkClose))
Py_DECREF(extlink);
if (Tokenizer_emit(self, ExternalLinkClose))
return -1;
return 0;
}
@@ -553,7 +595,7 @@ static int Tokenizer_handle_free_link_text(
Tokenizer* self, int* parens, Textbuffer* tail, Unicode this)
{
#define PUSH_TAIL_BUFFER(tail, error) \
if (tail->length > 0) { \
if (tail && tail->length > 0) { \
if (Textbuffer_concat(self->topstack->textbuffer, tail)) \
return error; \
if (Textbuffer_reset(tail)) \


+ 24
- 7
mwparserfromhell/parser/tokenizer.py View File

@@ -299,17 +299,34 @@ class Tokenizer(object):

def _parse_wikilink(self):
"""Parse an internal wikilink at the head of the wikicode string."""
reset = self._head + 1
self._head += 2
reset = self._head - 1
try:
wikilink = self._parse(contexts.WIKILINK_TITLE)
# If the wikilink looks like an external link, parse it as such:
link, extra, delta = self._really_parse_external_link(True)
except BadRoute:
self._head = reset
self._emit_text("[[")
self._head = reset + 1
try:
# Otherwise, actually parse it as a wikilink:
wikilink = self._parse(contexts.WIKILINK_TITLE)
except BadRoute:
self._head = reset
self._emit_text("[[")
else:
self._emit(tokens.WikilinkOpen())
self._emit_all(wikilink)
self._emit(tokens.WikilinkClose())
else:
self._emit(tokens.WikilinkOpen())
self._emit_all(wikilink)
self._emit(tokens.WikilinkClose())
if self._context & contexts.EXT_LINK_TITLE:
# In this exceptional case, an external link that looks like a
# wikilink inside of an external link is parsed as text:
self._head = reset
self._emit_text("[[")
return
self._emit_text("[")
self._emit(tokens.ExternalLinkOpen(brackets=True))
self._emit_all(link)
self._emit(tokens.ExternalLinkClose())

def _handle_wikilink_separator(self):
"""Handle the separator between a wikilink's title and its text."""


+ 7
- 0
tests/tokenizer/external_links.mwtest View File

@@ -82,6 +82,13 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), Exter

---

name: brackets_recursive_2
label: bracket-enclosed link with a double bracket-enclosed link as the title
input: "[http://example.com [[http://example.com]]]"
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="[[http://example.com"), ExternalLinkClose(), Text(text="]]")]

---

name: period_after
label: a period after a free link that is excluded
input: "http://example.com."


+ 15
- 1
tests/tokenizer/integration.mwtest View File

@@ -175,7 +175,7 @@ output: [WikilinkOpen(), Text(text="File:Example.png"), WikilinkSeparator(), Tex
---

name: external_link_inside_wikilink_title
label: an external link inside a wikilink title, which is invalid
label: an external link inside a wikilink title, which is not parsed
input: "[[File:Example.png http://example.com]]"
output: [WikilinkOpen(), Text(text="File:Example.png http://example.com"), WikilinkClose()]

@@ -318,3 +318,17 @@ name: incomplete_comment_in_link_title_6
label: incomplete comments are invalid in link titles
input: "[[foo<!--bar"
output: [Text(text="[[foo<!--bar")]

---

name: wikilink_to_external_link_fallback
label: an external link enclosed in an extra pair of brackets (see issue #120)
input: "[[http://example.com foo bar]]"
output: [Text(text="["), ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="foo bar"), ExternalLinkClose(), Text(text="]")]

---

name: wikilink_to_external_link_fallback_2
label: an external link without a title enclosed in an extra pair of brackets (see issue #120)
input: "[[http://example.com]]"
output: [Text(text="["), ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkClose(), Text(text="]")]

Loading…
Cancel
Save