From 1d5bbbe25b9e2d59cf5879f0df49f404ae44540c Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@gmail.com>
Date: Fri, 5 Jun 2015 00:24:25 -0400
Subject: [PATCH] Disallow < and > in wikilink titles/template names (fixes
 #104)

---
 CHANGELOG                            |  2 ++
 docs/changelog.rst                   |  2 ++
 mwparserfromhell/parser/tokenizer.c  | 19 +++++++++++++------
 mwparserfromhell/parser/tokenizer.py | 11 ++++++++---
 4 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index a886bcb..c49aaf7 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,8 @@ v0.4.1 (unreleased):
   distributed along with new releases. Windows users can now take advantage of
   C speedups without having a compiler of their own.
 - Added support for Python 3.5.
+- '<' and '>' are now disallowed in wikilink titles and template names. This
+  also applies when they delimit tags, but comments (<!-- ... -->) are allowed.
 - Fixed some bugs in the release scripts.
 
 v0.4 (released May 23, 2015):
diff --git a/docs/changelog.rst b/docs/changelog.rst
index e94e2f3..3217a35 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -11,6 +11,8 @@ Unreleased
   distributed along with new releases. Windows users can now take advantage of
   C speedups without having a compiler of their own.
 - Added support for Python 3.5.
+- ``<`` and ``>`` are now disallowed in wikilink titles and template names.
+  This also applies to tags, but comments (``<!-- ... -->``) remain allowed.
 - Fixed some bugs in the release scripts.
 
 v0.4
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index c125021..ec0315f 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -1555,9 +1555,9 @@ static int Tokenizer_parse_comment(Tokenizer* self)
             Py_DECREF(comment);
             self->head += 2;
             if (self->topstack->context & LC_FAIL_NEXT) {
-                /* _verify_safe() sets this flag while parsing a template name
-                   when it encounters what might be a comment -- we must unset
-                   it to let _verify_safe() know it was correct: */
+                /* _verify_safe() sets this flag while parsing a template or
+                   link when it encounters what might be a comment -- we must
+                   unset it to let _verify_safe() know it was correct: */
                 self->topstack->context ^= LC_FAIL_NEXT;
             }
             return 0;
@@ -2868,10 +2868,16 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
     if (context & LC_FAIL_NEXT)
         return -1;
     if (context & LC_WIKILINK_TITLE) {
-        if (data == ']' || data == '{')
+        if (data == ']' || data == '{') {
             self->topstack->context |= LC_FAIL_NEXT;
-        else if (data == '\n' || data == '[' || data == '}')
+        } else if (data == '\n' || data == '[' || data == '}' || data == '>') {
             return -1;
+        } else if (data == '<') {
+            if (Tokenizer_READ(self, 1) == '!')
+                self->topstack->context |= LC_FAIL_NEXT;
+            else
+                return -1;
+        }
         return 0;
     }
     if (context & LC_EXT_LINK_TITLE)
@@ -2883,7 +2889,8 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
             self->topstack->context |= LC_FAIL_NEXT;
             return 0;
         }
-        if (data == ']') {
+        if (data == ']' || data == '>' || (data == '<' &&
+                                           Tokenizer_READ(self, 1) != '!')) {
             return -1;
         }
         if (data == '|')
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 36c83e1..4d7d885 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -610,7 +610,7 @@ class Tokenizer(object):
                 self._head += 2
                 if self._context & contexts.FAIL_NEXT:
                     # _verify_safe() sets this flag while parsing a template
-                    # name when it encounters what might be a comment -- we
+                    # or link when it encounters what might be a comment -- we
                     # must unset it to let _verify_safe() know it was correct:
                     self._context ^= contexts.FAIL_NEXT
                 return
@@ -1172,8 +1172,13 @@ class Tokenizer(object):
         if context & contexts.WIKILINK_TITLE:
             if this == "]" or this == "{":
                 self._context |= contexts.FAIL_NEXT
-            elif this == "\n" or this == "[" or this == "}":
+            elif this == "\n" or this == "[" or this == "}" or this == ">":
                 return False
+            elif this == "<":
+                if self._read(1) == "!":
+                    self._context |= contexts.FAIL_NEXT
+                else:
+                    return False
             return True
         elif context & contexts.EXT_LINK_TITLE:
             return this != "\n"
@@ -1181,7 +1186,7 @@ class Tokenizer(object):
             if this == "{" or this == "}" or this == "[":
                 self._context |= contexts.FAIL_NEXT
                 return True
-            if this == "]":
+            if this == "]" or this == ">" or (this == "<" and self._read(1) != "!"):
                 return False
             if this == "|":
                 return True