From 1946cf621dbc6d41ac280d18daf04979e567a698 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 27 Oct 2013 23:03:23 -0400 Subject: [PATCH] Add a temporary skip_style_tags until we resolve some issues. --- CHANGELOG | 3 +++ docs/changelog.rst | 4 ++++ mwparserfromhell/parser/__init__.py | 10 +++++++--- mwparserfromhell/parser/tokenizer.c | 10 ++++++---- mwparserfromhell/parser/tokenizer.h | 1 + mwparserfromhell/parser/tokenizer.py | 5 +++-- tests/_test_tree_equality.py | 2 +- tests/test_parser.py | 25 ++++++++++++++++++++++++- 8 files changed, 49 insertions(+), 11 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 30ddb9e..558e5cb 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -9,6 +9,9 @@ v0.4 (unreleased): - Wikicode.get_sections() now returns sections in the correct order. - Wikicode.matches() now accepts a tuple or list of strings/Wikicode objects instead of just a single string or Wikicode. +- Given the frequency of issues with the (admittedly insufficient) tag parser, + there's a temporary skip_style_tags argument to parse() that ignores '' and + ''' until these issues are corrected. - C code cleanup and speed improvements. v0.3.2 (released September 1, 2013): diff --git a/docs/changelog.rst b/docs/changelog.rst index 83f4b88..07b02da 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -19,6 +19,10 @@ Unreleased - :py:meth:`.Wikicode.matches` now accepts a tuple or list of strings/:py:class:`.Wikicode` objects instead of just a single string or :py:class:`.Wikicode`. +- Given the frequency of issues with the (admittedly insufficient) tag parser, + there's a temporary *skip_style_tags* argument to + :py:meth:`~mwparserfromhell.parse` that ignores ``''`` and ``'''`` until + these issues are corrected. - C code cleanup and speed improvements. v0.3.2 diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 81dea9b..6cbfa3a 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -53,8 +53,12 @@ class Parser(object): self._tokenizer = Tokenizer() self._builder = Builder() - def parse(self, text, context=0): - """Parse *text*, returning a :py:class:`~.Wikicode` object tree.""" - tokens = self._tokenizer.tokenize(text, context) + def parse(self, text, context=0, skip_style_tags=False): + """Parse *text*, returning a :py:class:`~.Wikicode` object tree. + + If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be + parsed, but instead be treated as plain text. + """ + tokens = self._tokenizer.tokenize(text, context, skip_style_tags) code = self._builder.build(tokens) return code diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index ac0d863..c37d8dc 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2640,7 +2640,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) } else if (this == '>' && this_context & LC_TAG_CLOSE) return Tokenizer_handle_tag_close_close(self); - else if (this == next && next == '\'') { + else if (this == next && next == '\'' && !self->skip_style_tags) { temp = Tokenizer_parse_style(self); if (temp != Py_None) return temp; @@ -2675,9 +2675,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) { PyObject *text, *temp; - int context = 0; + int context = 0, skip_style_tags = 0; - if (PyArg_ParseTuple(args, "U|i", &text, &context)) { + if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { Py_XDECREF(self->text); self->text = PySequence_Fast(text, "expected a sequence"); } @@ -2686,7 +2686,8 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) Py_ssize_t size; /* Failed to parse a Unicode object; try a string instead. */ PyErr_Clear(); - if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context)) + if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context, + &skip_style_tags)) return NULL; temp = PyUnicode_FromStringAndSize(encoded, size); if (!text) @@ -2698,6 +2699,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) } self->head = self->global = self->depth = self->cycles = 0; self->length = PyList_GET_SIZE(self->text); + self->skip_style_tags = skip_style_tags; return Tokenizer_parse(self, context, 1); } diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 4b28e02..ef5acd6 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -223,6 +223,7 @@ typedef struct { int global; /* global context */ int depth; /* stack recursion depth */ int cycles; /* total number of stack recursions */ + int skip_style_tags; /* temporary fix for the sometimes broken tag parser */ } Tokenizer; diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 35a2b09..8d12b62 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1124,7 +1124,7 @@ class Tokenizer(object): self._emit_text("<") elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() - elif this == next == "'": + elif this == next == "'" and not self._skip_style_tags: result = self._parse_style() if result is not None: return result @@ -1141,8 +1141,9 @@ class Tokenizer(object): self._emit_text(this) self._head += 1 - def tokenize(self, text, context=0): + def tokenize(self, text, context=0, skip_style_tags=False): """Build a list of tokens from a string of wikicode and return it.""" + self._skip_style_tags = skip_style_tags split = self.regex.split(text) self._text = [segment for segment in split if segment] self._head = self._global = self._depth = self._cycles = 0 diff --git a/tests/_test_tree_equality.py b/tests/_test_tree_equality.py index d6d92f1..38350d8 100644 --- a/tests/_test_tree_equality.py +++ b/tests/_test_tree_equality.py @@ -106,7 +106,7 @@ class TreeEqualityTestCase(TestCase): self.assertEqual(exp_attr.pad_first, act_attr.pad_first) self.assertEqual(exp_attr.pad_before_eq, act_attr.pad_before_eq) self.assertEqual(exp_attr.pad_after_eq, act_attr.pad_after_eq) - self.assertIs(expected.wiki_markup, actual.wiki_markup) + self.assertEqual(expected.wiki_markup, actual.wiki_markup) self.assertIs(expected.self_closing, actual.self_closing) self.assertIs(expected.invalid, actual.invalid) self.assertIs(expected.implicit, actual.implicit) diff --git a/tests/test_parser.py b/tests/test_parser.py index 5c50b01..672cbff 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -24,7 +24,7 @@ from __future__ import unicode_literals import unittest from mwparserfromhell import parser -from mwparserfromhell.nodes import Template, Text, Wikilink +from mwparserfromhell.nodes import Tag, Template, Text, Wikilink from mwparserfromhell.nodes.extras import Parameter from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext @@ -35,10 +35,12 @@ class TestParser(TreeEqualityTestCase): def test_use_c(self): """make sure the correct tokenizer is used""" + restore = parser.use_c if parser.use_c: self.assertTrue(parser.Parser()._tokenizer.USES_C) parser.use_c = False self.assertFalse(parser.Parser()._tokenizer.USES_C) + parser.use_c = restore def test_parsing(self): """integration test for parsing overall""" @@ -62,5 +64,26 @@ class TestParser(TreeEqualityTestCase): actual = parser.Parser().parse(text) self.assertWikicodeEqual(expected, actual) + def test_skip_style_tags(self): + """test Parser.parse(skip_style_tags=True)""" + def test(): + with_style = parser.Parser().parse(text, skip_style_tags=False) + without_style = parser.Parser().parse(text, skip_style_tags=True) + self.assertWikicodeEqual(a, with_style) + self.assertWikicodeEqual(b, without_style) + + text = "This is an example with ''italics''!" + a = wrap([Text("This is an example with "), + Tag(wraptext("i"), wraptext("italics"), wiki_markup="''"), + Text("!")]) + b = wraptext("This is an example with ''italics''!") + + restore = parser.use_c + if parser.use_c: + test() + parser.use_c = False + test() + parser.use_c = restore + if __name__ == "__main__": unittest.main(verbosity=2)