From 1d5bbbe25b9e2d59cf5879f0df49f404ae44540c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 5 Jun 2015 00:24:25 -0400 Subject: [PATCH] Disallow < and > in wikilink titles/template names (fixes #104) --- CHANGELOG | 2 ++ docs/changelog.rst | 2 ++ mwparserfromhell/parser/tokenizer.c | 19 +++++++++++++------ mwparserfromhell/parser/tokenizer.py | 11 ++++++++--- 4 files changed, 25 insertions(+), 9 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index a886bcb..c49aaf7 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,8 @@ v0.4.1 (unreleased): distributed along with new releases. Windows users can now take advantage of C speedups without having a compiler of their own. - Added support for Python 3.5. +- '<' and '>' are now disallowed in wikilink titles and template names. This + includes when denoting tags, but not comments. - Fixed some bugs in the release scripts. v0.4 (released May 23, 2015): diff --git a/docs/changelog.rst b/docs/changelog.rst index e94e2f3..3217a35 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -11,6 +11,8 @@ Unreleased distributed along with new releases. Windows users can now take advantage of C speedups without having a compiler of their own. - Added support for Python 3.5. +- ``<`` and ``>`` are now disallowed in wikilink titles and template names. + This includes when denoting tags, but not comments. - Fixed some bugs in the release scripts. v0.4 diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index c125021..ec0315f 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1555,9 +1555,9 @@ static int Tokenizer_parse_comment(Tokenizer* self) Py_DECREF(comment); self->head += 2; if (self->topstack->context & LC_FAIL_NEXT) { - /* _verify_safe() sets this flag while parsing a template name - when it encounters what might be a comment -- we must unset - it to let _verify_safe() know it was correct: */ + /* _verify_safe() sets this flag while parsing a template or + link when it encounters what might be a comment -- we must + unset it to let _verify_safe() know it was correct: */ self->topstack->context ^= LC_FAIL_NEXT; } return 0; @@ -2868,10 +2868,16 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) if (context & LC_FAIL_NEXT) return -1; if (context & LC_WIKILINK_TITLE) { - if (data == ']' || data == '{') + if (data == ']' || data == '{') { self->topstack->context |= LC_FAIL_NEXT; - else if (data == '\n' || data == '[' || data == '}') + } else if (data == '\n' || data == '[' || data == '}' || data == '>') { return -1; + } else if (data == '<') { + if (Tokenizer_READ(self, 1) == '!') + self->topstack->context |= LC_FAIL_NEXT; + else + return -1; + } return 0; } if (context & LC_EXT_LINK_TITLE) @@ -2883,7 +2889,8 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) self->topstack->context |= LC_FAIL_NEXT; return 0; } - if (data == ']') { + if (data == ']' || data == '>' || (data == '<' && + Tokenizer_READ(self, 1) != '!')) { return -1; } if (data == '|') diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 36c83e1..4d7d885 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -610,7 +610,7 @@ class Tokenizer(object): self._head += 2 if self._context & contexts.FAIL_NEXT: # _verify_safe() sets this flag while parsing a template - # name when it encounters what might be a comment -- we + # or link when it encounters what might be a comment -- we # must unset it to let _verify_safe() know it was correct: self._context ^= contexts.FAIL_NEXT return @@ -1172,8 +1172,13 @@ class Tokenizer(object): if context & contexts.WIKILINK_TITLE: if this == "]" or this == "{": self._context |= contexts.FAIL_NEXT - elif this == "\n" or this == "[" or this == "}": + elif this == "\n" or this == "[" or this == "}" or this == ">": return False + elif this == "<": + if self._read(1) == "!": + self._context |= contexts.FAIL_NEXT + else: + return False return True elif context & contexts.EXT_LINK_TITLE: return this != "\n" @@ -1181,7 +1186,7 @@ class Tokenizer(object): if this == "{" or this == "}" or this == "[": self._context |= contexts.FAIL_NEXT return True - if this == "]": + if this == "]" or this == ">" or (this == "<" and self._read(1) != "!"): return False if this == "|": return True