@@ -9,6 +9,9 @@ v0.4 (unreleased):
 - Wikicode.get_sections() now returns sections in the correct order.
 - Wikicode.matches() now accepts a tuple or list of strings/Wikicode objects
   instead of just a single string or Wikicode.
+- Given the frequency of issues with the (admittedly insufficient) tag parser,
+  there's a temporary skip_style_tags argument to parse() that ignores '' and
+  ''' until these issues are corrected.
 - C code cleanup and speed improvements.

 v0.3.2 (released September 1, 2013):
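
A quick sketch of the workaround in use, based only on the changelog entry above (filter_tags() is the standard Wikicode query method):

    import mwparserfromhell

    text = "This is an example with ''italics''!"

    # Default behavior: '' is parsed into an <i> Tag node.
    print(mwparserfromhell.parse(text).filter_tags())

    # Temporary workaround: '' and ''' are treated as plain text.
    plain = mwparserfromhell.parse(text, skip_style_tags=True)
    print(plain.filter_tags())     # -> [] (no Tag nodes)
    print(str(plain) == text)      # -> True; output still round-trips
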
@@ -19,6 +19,10 @@ Unreleased
 - :py:meth:`.Wikicode.matches` now accepts a tuple or list of
   strings/:py:class:`.Wikicode` objects instead of just a single string or
   :py:class:`.Wikicode`.
+- Given the frequency of issues with the (admittedly insufficient) tag parser,
+  there's a temporary *skip_style_tags* argument to
+  :py:meth:`~mwparserfromhell.parse` that ignores ``''`` and ``'''`` until
+  these issues are corrected.
 - C code cleanup and speed improvements.

 v0.3.2

@@ -53,8 +53,12 @@ class Parser(object):
         self._tokenizer = Tokenizer()
         self._builder = Builder()

-    def parse(self, text, context=0):
-        """Parse *text*, returning a :py:class:`~.Wikicode` object tree."""
-        tokens = self._tokenizer.tokenize(text, context)
+    def parse(self, text, context=0, skip_style_tags=False):
+        """Parse *text*, returning a :py:class:`~.Wikicode` object tree.
+
+        If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be
+        parsed, but instead be treated as plain text.
+        """
+        tokens = self._tokenizer.tokenize(text, context, skip_style_tags)
         code = self._builder.build(tokens)
         return code
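
For contrast, a minimal sketch of the two trees the changed method can produce (Tag and Text are the node classes imported by the test file below):

    from mwparserfromhell.parser import Parser

    styled = Parser().parse("''x''")
    # Style parsing on (the default): the tree holds a Tag node for <i>,
    # with wiki_markup recording the original '' delimiters.
    print(styled.filter_tags())

    flat = Parser().parse("''x''", skip_style_tags=True)
    # Flag set: the apostrophes stay inside a plain Text node.
    print(flat.filter_tags())   # -> []
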
@@ -2640,7 +2640,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         }
         else if (this == '>' && this_context & LC_TAG_CLOSE)
             return Tokenizer_handle_tag_close_close(self);
-        else if (this == next && next == '\'') {
+        else if (this == next && next == '\'' && !self->skip_style_tags) {
             temp = Tokenizer_parse_style(self);
             if (temp != Py_None)
                 return temp;
@@ -2675,9 +2675,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
 static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 {
     PyObject *text, *temp;
-    int context = 0;
+    int context = 0, skip_style_tags = 0;

-    if (PyArg_ParseTuple(args, "U|i", &text, &context)) {
+    if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
         Py_XDECREF(self->text);
         self->text = PySequence_Fast(text, "expected a sequence");
     }
@@ -2686,7 +2686,8 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
         Py_ssize_t size;
         /* Failed to parse a Unicode object; try a string instead. */
         PyErr_Clear();
-        if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context))
+        if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context,
+                              &skip_style_tags))
             return NULL;
         temp = PyUnicode_FromStringAndSize(encoded, size);
         if (!text)
@@ -2698,6 +2699,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     }

     self->head = self->global = self->depth = self->cycles = 0;
     self->length = PyList_GET_SIZE(self->text);
+    self->skip_style_tags = skip_style_tags;
     return Tokenizer_parse(self, context, 1);
 }
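
With these two hunks, the C entry point accepts the flag as a third optional integer (the trailing "i" in "U|ii" and "s#|ii"). A sketch of driving it directly; the import path is an assumption based on how the parser package normally loads its C extension:

    try:
        # Assumed location of the compiled extension's tokenizer class.
        from mwparserfromhell.parser._tokenizer import CTokenizer
    except ImportError:
        CTokenizer = None   # extension not built; pure-Python tokenizer only

    if CTokenizer is not None:
        # bool subclasses int, so True satisfies the "i" format code as 1.
        tokens = CTokenizer().tokenize("''x''", 0, True)
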
@@ -223,6 +223,7 @@ typedef struct {
     int global;             /* global context */
     int depth;              /* stack recursion depth */
     int cycles;             /* total number of stack recursions */
+    int skip_style_tags;    /* temporary fix for the sometimes broken tag parser */
 } Tokenizer;

@@ -1124,7 +1124,7 @@ class Tokenizer(object):
                 self._emit_text("<")
             elif this == ">" and self._context & contexts.TAG_CLOSE:
                 return self._handle_tag_close_close()
-            elif this == next == "'":
+            elif this == next == "'" and not self._skip_style_tags:
                 result = self._parse_style()
                 if result is not None:
                     return result
@@ -1141,8 +1141,9 @@ class Tokenizer(object):
             self._emit_text(this)
             self._head += 1

-    def tokenize(self, text, context=0):
+    def tokenize(self, text, context=0, skip_style_tags=False):
         """Build a list of tokens from a string of wikicode and return it."""
+        self._skip_style_tags = skip_style_tags
         split = self.regex.split(text)
         self._text = [segment for segment in split if segment]
         self._head = self._global = self._depth = self._cycles = 0
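
The pure-Python tokenizer now mirrors that signature, which makes the effect easy to see on raw token streams (a sketch; exact token reprs may differ):

    from mwparserfromhell.parser.tokenizer import Tokenizer

    with_style = Tokenizer().tokenize("''x''")
    without_style = Tokenizer().tokenize("''x''", 0, True)

    # with_style includes TagOpenOpen/TagCloseOpen/... tokens for the
    # italics; without_style is just Text token(s) carrying "''x''" verbatim.
    print(with_style)
    print(without_style)
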
@@ -106,7 +106,7 @@ class TreeEqualityTestCase(TestCase):
             self.assertEqual(exp_attr.pad_first, act_attr.pad_first)
             self.assertEqual(exp_attr.pad_before_eq, act_attr.pad_before_eq)
             self.assertEqual(exp_attr.pad_after_eq, act_attr.pad_after_eq)
-        self.assertIs(expected.wiki_markup, actual.wiki_markup)
+        self.assertEqual(expected.wiki_markup, actual.wiki_markup)
         self.assertIs(expected.self_closing, actual.self_closing)
         self.assertIs(expected.invalid, actual.invalid)
         self.assertIs(expected.implicit, actual.implicit)
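
The loosened check is deliberate: self_closing, invalid, and implicit hold the True/False singletons, where identity (assertIs) is safe, but wiki_markup holds a string built during tokenization, which is equal to the expected literal without necessarily being the same object. A small sketch of the distinction:

    built = "'" * 2          # string constructed at runtime
    assert built == "''"     # equality holds...
    # ...but `built is "''"` may be False: equal strings are not guaranteed
    # to be interned to one object, hence assertEqual for wiki_markup.
    assert (1 == 1) is True  # booleans, by contrast, are singletons
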
@@ -24,7 +24,7 @@ from __future__ import unicode_literals
 import unittest

 from mwparserfromhell import parser
-from mwparserfromhell.nodes import Template, Text, Wikilink
+from mwparserfromhell.nodes import Tag, Template, Text, Wikilink
 from mwparserfromhell.nodes.extras import Parameter

 from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -35,10 +35,12 @@ class TestParser(TreeEqualityTestCase):

     def test_use_c(self):
         """make sure the correct tokenizer is used"""
+        restore = parser.use_c
         if parser.use_c:
             self.assertTrue(parser.Parser()._tokenizer.USES_C)
             parser.use_c = False
         self.assertFalse(parser.Parser()._tokenizer.USES_C)
+        parser.use_c = restore

     def test_parsing(self):
         """integration test for parsing overall"""
@@ -62,5 +64,26 @@ class TestParser(TreeEqualityTestCase):
         actual = parser.Parser().parse(text)
         self.assertWikicodeEqual(expected, actual)

+    def test_skip_style_tags(self):
+        """test Parser.parse(skip_style_tags=True)"""
+        def test():
+            with_style = parser.Parser().parse(text, skip_style_tags=False)
+            without_style = parser.Parser().parse(text, skip_style_tags=True)
+            self.assertWikicodeEqual(a, with_style)
+            self.assertWikicodeEqual(b, without_style)
+
+        text = "This is an example with ''italics''!"
+        a = wrap([Text("This is an example with "),
+                  Tag(wraptext("i"), wraptext("italics"), wiki_markup="''"),
+                  Text("!")])
+        b = wraptext("This is an example with ''italics''!")
+
+        restore = parser.use_c
+        if parser.use_c:
+            test()
+            parser.use_c = False
+        test()
+        parser.use_c = restore
+
 if __name__ == "__main__":
     unittest.main(verbosity=2)