
Disallow < and > in wikilink titles/template names (fixes #104)

tags/v0.4.1
Ben Kurtovic, 8 years ago
commit 1d5bbbe25b
4 changed files with 25 additions and 9 deletions:

  1. CHANGELOG                             (+2, -0)
  2. docs/changelog.rst                    (+2, -0)
  3. mwparserfromhell/parser/tokenizer.c   (+13, -6)
  4. mwparserfromhell/parser/tokenizer.py  (+8, -3)
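
The behavior change is easiest to see through the library's public API. Below is a minimal sketch of the expected result, based on the changelog entries in this commit; it assumes mwparserfromhell 0.4.1 or later (with this fix), and the expected outputs are illustrative rather than authoritative:

    import mwparserfromhell

    # A "<" or ">" now invalidates a wikilink title, so no wikilink node is built.
    print(mwparserfromhell.parse("[[foo<bar]]").filter_wikilinks())    # expected: []
    print(mwparserfromhell.parse("[[foo>bar]]").filter_wikilinks())    # expected: []

    # Comments remain allowed inside a title, so this still parses as a wikilink.
    print(mwparserfromhell.parse("[[foo<!-- note -->bar]]").filter_wikilinks())
    # expected: a single wikilink whose title contains the comment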

CHANGELOG  (+2, -0)

@@ -4,6 +4,8 @@ v0.4.1 (unreleased):
   distributed along with new releases. Windows users can now take advantage of
   C speedups without having a compiler of their own.
 - Added support for Python 3.5.
+- '<' and '>' are now disallowed in wikilink titles and template names. This
+  includes when denoting tags, but not comments.
 - Fixed some bugs in the release scripts.
 
 v0.4 (released May 23, 2015):


docs/changelog.rst  (+2, -0)

@@ -11,6 +11,8 @@ Unreleased
   distributed along with new releases. Windows users can now take advantage of
   C speedups without having a compiler of their own.
 - Added support for Python 3.5.
+- ``<`` and ``>`` are now disallowed in wikilink titles and template names.
+  This includes when denoting tags, but not comments.
 - Fixed some bugs in the release scripts.
 
 v0.4


mwparserfromhell/parser/tokenizer.c  (+13, -6)

@@ -1555,9 +1555,9 @@ static int Tokenizer_parse_comment(Tokenizer* self)
             Py_DECREF(comment);
             self->head += 2;
             if (self->topstack->context & LC_FAIL_NEXT) {
-                /* _verify_safe() sets this flag while parsing a template name
-                   when it encounters what might be a comment -- we must unset
-                   it to let _verify_safe() know it was correct: */
+                /* _verify_safe() sets this flag while parsing a template or
+                   link when it encounters what might be a comment -- we must
+                   unset it to let _verify_safe() know it was correct: */
                 self->topstack->context ^= LC_FAIL_NEXT;
             }
             return 0;
@@ -2868,10 +2868,16 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
     if (context & LC_FAIL_NEXT)
         return -1;
     if (context & LC_WIKILINK_TITLE) {
-        if (data == ']' || data == '{')
+        if (data == ']' || data == '{') {
             self->topstack->context |= LC_FAIL_NEXT;
-        else if (data == '\n' || data == '[' || data == '}')
+        } else if (data == '\n' || data == '[' || data == '}' || data == '>') {
             return -1;
+        } else if (data == '<') {
+            if (Tokenizer_READ(self, 1) == '!')
+                self->topstack->context |= LC_FAIL_NEXT;
+            else
+                return -1;
+        }
         return 0;
     }
     if (context & LC_EXT_LINK_TITLE)
@@ -2883,7 +2889,8 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
             self->topstack->context |= LC_FAIL_NEXT;
             return 0;
         }
-        if (data == ']') {
+        if (data == ']' || data == '>' || (data == '<' &&
+                                           Tokenizer_READ(self, 1) != '!')) {
             return -1;
         }
         if (data == '|')
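
In Tokenizer_verify_safe, the wikilink-title branch now tolerates '<' only when it could start a comment ("<!"): LC_FAIL_NEXT is set and later cleared by the comment parser (see the Tokenizer_parse_comment hunk above), while any other '<', and '>' unconditionally, is a hard failure. As a standalone sketch of just that branch (a hypothetical helper, not library code; Python is used to match the pure-Python tokenizer that follows):

    # Hypothetical helper summarizing the LC_WIKILINK_TITLE branch above.
    def check_wikilink_title_char(this, nxt):
        """Classify one character read while inside a [[wikilink]] title."""
        if this in ("]", "{"):
            return "fail_next"   # may be the closing "]]" or a template start
        if this in ("\n", "[", "}", ">"):
            return "fail"        # ">" is now rejected outright
        if this == "<":
            # "<!" may begin a comment: flag FAIL_NEXT and let the comment
            # parser clear it on success; any other "<" (e.g. a tag) fails.
            return "fail_next" if nxt == "!" else "fail"
        return "ok"              # anything else is a safe title character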


mwparserfromhell/parser/tokenizer.py  (+8, -3)

@@ -610,7 +610,7 @@ class Tokenizer(object):
                 self._head += 2
                 if self._context & contexts.FAIL_NEXT:
                     # _verify_safe() sets this flag while parsing a template
-                    # name when it encounters what might be a comment -- we
+                    # or link when it encounters what might be a comment -- we
                     # must unset it to let _verify_safe() know it was correct:
                     self._context ^= contexts.FAIL_NEXT
                 return
@@ -1172,8 +1172,13 @@ class Tokenizer(object):
         if context & contexts.WIKILINK_TITLE:
             if this == "]" or this == "{":
                 self._context |= contexts.FAIL_NEXT
-            elif this == "\n" or this == "[" or this == "}":
+            elif this == "\n" or this == "[" or this == "}" or this == ">":
                 return False
+            elif this == "<":
+                if self._read(1) == "!":
+                    self._context |= contexts.FAIL_NEXT
+                else:
+                    return False
             return True
         elif context & contexts.EXT_LINK_TITLE:
             return this != "\n"
@@ -1181,7 +1186,7 @@ class Tokenizer(object):
             if this == "{" or this == "}" or this == "[":
                 self._context |= contexts.FAIL_NEXT
                 return True
-            if this == "]":
+            if this == "]" or this == ">" or (this == "<" and self._read(1) != "!"):
                 return False
             if this == "|":
                 return True
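
The second pair of _verify_safe hunks, which the changelog ties to template names, gets the same treatment. A companion sketch with the same caveats as the example near the top (illustrative expectations, mwparserfromhell 0.4.1 or later assumed):

    import mwparserfromhell

    # "<" and ">" now invalidate a template name, so no template node is built.
    print(mwparserfromhell.parse("{{foo<bar}}").filter_templates())    # expected: []
    print(mwparserfromhell.parse("{{foo>bar}}").filter_templates())    # expected: []

    # A comment is still fine, since "<!" is handed off to the comment parser.
    print(mwparserfromhell.parse("{{foo<!-- note -->bar}}").filter_templates())
    # expected: a single template node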

