From 1946cf621dbc6d41ac280d18daf04979e567a698 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@gmail.com>
Date: Sun, 27 Oct 2013 23:03:23 -0400
Subject: [PATCH] Add a temporary skip_style_tags argument until we resolve some issues.

---
 CHANGELOG                            |  3 +++
 docs/changelog.rst                   |  4 ++++
 mwparserfromhell/parser/__init__.py  | 10 +++++++---
 mwparserfromhell/parser/tokenizer.c  | 10 ++++++----
 mwparserfromhell/parser/tokenizer.h  |  1 +
 mwparserfromhell/parser/tokenizer.py |  5 +++--
 tests/_test_tree_equality.py         |  2 +-
 tests/test_parser.py                 | 25 ++++++++++++++++++++++++-
 8 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 30ddb9e..558e5cb 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -9,6 +9,9 @@ v0.4 (unreleased):
 - Wikicode.get_sections() now returns sections in the correct order.
 - Wikicode.matches() now accepts a tuple or list of strings/Wikicode objects
   instead of just a single string or Wikicode.
+- Given the frequency of issues with the (admittedly insufficient) tag parser,
+  there's a temporary skip_style_tags argument to parse() that treats '' and
+  ''' as plain text until these issues are corrected.
 - C code cleanup and speed improvements.
 
 v0.3.2 (released September 1, 2013):
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 83f4b88..07b02da 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -19,6 +19,10 @@ Unreleased
 - :py:meth:`.Wikicode.matches` now accepts a tuple or list of
   strings/:py:class:`.Wikicode` objects instead of just a single string or
   :py:class:`.Wikicode`.
+- Given the frequency of issues with the (admittedly insufficient) tag parser,
+  there's a temporary *skip_style_tags* argument to
+  :py:meth:`~mwparserfromhell.parse` that treats ``''`` and ``'''`` as plain
+  text until these issues are corrected.
 - C code cleanup and speed improvements.
 
 v0.3.2
diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py
index 81dea9b..6cbfa3a 100644
--- a/mwparserfromhell/parser/__init__.py
+++ b/mwparserfromhell/parser/__init__.py
@@ -53,8 +53,12 @@ class Parser(object):
             self._tokenizer = Tokenizer()
         self._builder = Builder()
 
-    def parse(self, text, context=0):
-        """Parse *text*, returning a :py:class:`~.Wikicode` object tree."""
-        tokens = self._tokenizer.tokenize(text, context)
+    def parse(self, text, context=0, skip_style_tags=False):
+        """Parse *text*, returning a :py:class:`~.Wikicode` object tree.
+
+        If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be
+        parsed, but instead be treated as plain text.
+        """
+        tokens = self._tokenizer.tokenize(text, context, skip_style_tags)
         code = self._builder.build(tokens)
         return code
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index ac0d863..c37d8dc 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -2640,7 +2640,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         }
         else if (this == '>' && this_context & LC_TAG_CLOSE)
             return Tokenizer_handle_tag_close_close(self);
-        else if (this == next && next == '\'') {
+        else if (this == next && next == '\'' && !self->skip_style_tags) {
             temp = Tokenizer_parse_style(self);
             if (temp != Py_None)
                 return temp;
@@ -2675,9 +2675,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
 static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 {
     PyObject *text, *temp;
-    int context = 0;
+    int context = 0, skip_style_tags = 0;
 
-    if (PyArg_ParseTuple(args, "U|i", &text, &context)) {
+    if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
         Py_XDECREF(self->text);
         self->text = PySequence_Fast(text, "expected a sequence");
     }
@@ -2686,7 +2686,8 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
         Py_ssize_t size;
         /* Failed to parse a Unicode object; try a string instead. */
         PyErr_Clear();
-        if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context))
+        if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context,
+                              &skip_style_tags))
             return NULL;
         temp = PyUnicode_FromStringAndSize(encoded, size);
         if (!text)
@@ -2698,6 +2699,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     }
     self->head = self->global = self->depth = self->cycles = 0;
     self->length = PyList_GET_SIZE(self->text);
+    self->skip_style_tags = skip_style_tags;
     return Tokenizer_parse(self, context, 1);
 }
 
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 4b28e02..ef5acd6 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -223,6 +223,7 @@ typedef struct {
     int global;             /* global context */
     int depth;              /* stack recursion depth */
     int cycles;             /* total number of stack recursions */
+    int skip_style_tags;    /* temporary fix for the sometimes broken tag parser */
 } Tokenizer;
 
 
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 35a2b09..8d12b62 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -1124,7 +1124,7 @@ class Tokenizer(object):
                     self._emit_text("<")
             elif this == ">" and self._context & contexts.TAG_CLOSE:
                 return self._handle_tag_close_close()
-            elif this == next == "'":
+            elif this == next == "'" and not self._skip_style_tags:
                 result = self._parse_style()
                 if result is not None:
                     return result
@@ -1141,8 +1141,9 @@ class Tokenizer(object):
                 self._emit_text(this)
             self._head += 1
 
-    def tokenize(self, text, context=0):
+    def tokenize(self, text, context=0, skip_style_tags=False):
         """Build a list of tokens from a string of wikicode and return it."""
+        self._skip_style_tags = skip_style_tags
         split = self.regex.split(text)
         self._text = [segment for segment in split if segment]
         self._head = self._global = self._depth = self._cycles = 0
diff --git a/tests/_test_tree_equality.py b/tests/_test_tree_equality.py
index d6d92f1..38350d8 100644
--- a/tests/_test_tree_equality.py
+++ b/tests/_test_tree_equality.py
@@ -106,7 +106,7 @@ class TreeEqualityTestCase(TestCase):
             self.assertEqual(exp_attr.pad_first, act_attr.pad_first)
             self.assertEqual(exp_attr.pad_before_eq, act_attr.pad_before_eq)
             self.assertEqual(exp_attr.pad_after_eq, act_attr.pad_after_eq)
-        self.assertIs(expected.wiki_markup, actual.wiki_markup)
+        self.assertEqual(expected.wiki_markup, actual.wiki_markup)
         self.assertIs(expected.self_closing, actual.self_closing)
         self.assertIs(expected.invalid, actual.invalid)
         self.assertIs(expected.implicit, actual.implicit)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 5c50b01..672cbff 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -24,7 +24,7 @@ from __future__ import unicode_literals
 import unittest
 
 from mwparserfromhell import parser
-from mwparserfromhell.nodes import Template, Text, Wikilink
+from mwparserfromhell.nodes import Tag, Template, Text, Wikilink
 from mwparserfromhell.nodes.extras import Parameter
 
 from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -35,10 +35,12 @@ class TestParser(TreeEqualityTestCase):
 
     def test_use_c(self):
         """make sure the correct tokenizer is used"""
+        restore = parser.use_c
         if parser.use_c:
             self.assertTrue(parser.Parser()._tokenizer.USES_C)
             parser.use_c = False
         self.assertFalse(parser.Parser()._tokenizer.USES_C)
+        parser.use_c = restore
 
     def test_parsing(self):
         """integration test for parsing overall"""
@@ -62,5 +64,26 @@ class TestParser(TreeEqualityTestCase):
         actual = parser.Parser().parse(text)
         self.assertWikicodeEqual(expected, actual)
 
+    def test_skip_style_tags(self):
+        """test Parser.parse(skip_style_tags=True)"""
+        def test():
+            with_style = parser.Parser().parse(text, skip_style_tags=False)
+            without_style = parser.Parser().parse(text, skip_style_tags=True)
+            self.assertWikicodeEqual(a, with_style)
+            self.assertWikicodeEqual(b, without_style)
+
+        text = "This is an example with ''italics''!"
+        a = wrap([Text("This is an example with "),
+                  Tag(wraptext("i"), wraptext("italics"), wiki_markup="''"),
+                  Text("!")])
+        b = wraptext("This is an example with ''italics''!")
+
+        restore = parser.use_c
+        if parser.use_c:
+            test()
+            parser.use_c = False
+        test()
+        parser.use_c = restore
+
 if __name__ == "__main__":
     unittest.main(verbosity=2)