@@ -9,6 +9,9 @@ v0.4 (unreleased):
 - Wikicode.get_sections() now returns sections in the correct order.
 - Wikicode.matches() now accepts a tuple or list of strings/Wikicode objects
   instead of just a single string or Wikicode.
+- Given the frequency of issues with the (admittedly insufficient) tag parser,
+  there's a temporary skip_style_tags argument to parse() that ignores '' and
+  ''' until these issues are corrected.
 - C code cleanup and speed improvements.

 v0.3.2 (released September 1, 2013):
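
A quick sketch of the workaround in use, based only on the changelog entry above (filter_tags() is the standard Wikicode query method):

    import mwparserfromhell

    text = "This is an example with ''italics''!"

    # Default behavior: '' is parsed into an <i> Tag node.
    print(mwparserfromhell.parse(text).filter_tags())

    # Temporary workaround: '' and ''' are treated as plain text.
    plain = mwparserfromhell.parse(text, skip_style_tags=True)
    print(plain.filter_tags())     # -> [] (no Tag nodes)
    print(str(plain) == text)      # -> True; output still round-trips
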
@@ -19,6 +19,10 @@ Unreleased
 - :py:meth:`.Wikicode.matches` now accepts a tuple or list of
   strings/:py:class:`.Wikicode` objects instead of just a single string or
   :py:class:`.Wikicode`.
+- Given the frequency of issues with the (admittedly insufficient) tag parser,
+  there's a temporary *skip_style_tags* argument to
+  :py:meth:`~mwparserfromhell.parse` that ignores ``''`` and ``'''`` until
+  these issues are corrected.
 - C code cleanup and speed improvements.

 v0.3.2

@@ -53,8 +53,12 @@ class Parser(object):
         self._tokenizer = Tokenizer()
         self._builder = Builder()

-    def parse(self, text, context=0):
-        """Parse *text*, returning a :py:class:`~.Wikicode` object tree."""
-        tokens = self._tokenizer.tokenize(text, context)
+    def parse(self, text, context=0, skip_style_tags=False):
+        """Parse *text*, returning a :py:class:`~.Wikicode` object tree.
+
+        If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be
+        parsed, but instead be treated as plain text.
+        """
+        tokens = self._tokenizer.tokenize(text, context, skip_style_tags)
         code = self._builder.build(tokens)
         return code
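
For contrast, a minimal sketch of the two trees the changed method can produce (Tag and Text are the node classes imported by the test file below):

    from mwparserfromhell.parser import Parser

    styled = Parser().parse("''x''")
    # Style parsing on (the default): the tree holds a Tag node for <i>,
    # with wiki_markup recording the original '' delimiters.
    print(styled.filter_tags())

    flat = Parser().parse("''x''", skip_style_tags=True)
    # Flag set: the apostrophes stay inside a plain Text node.
    print(flat.filter_tags())   # -> []
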
@@ -2640,7 +2640,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         }
         else if (this == '>' && this_context & LC_TAG_CLOSE)
             return Tokenizer_handle_tag_close_close(self);
-        else if (this == next && next == '\'') {
+        else if (this == next && next == '\'' && !self->skip_style_tags) {
             temp = Tokenizer_parse_style(self);
             if (temp != Py_None)
                 return temp;
@@ -2675,9 +2675,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
 static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 {
     PyObject *text, *temp;
-    int context = 0;
+    int context = 0, skip_style_tags = 0;

-    if (PyArg_ParseTuple(args, "U|i", &text, &context)) {
+    if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
         Py_XDECREF(self->text);
         self->text = PySequence_Fast(text, "expected a sequence");
     }
@@ -2686,7 +2686,8 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
         Py_ssize_t size;
         /* Failed to parse a Unicode object; try a string instead. */
         PyErr_Clear();
-        if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context))
+        if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context,
+                              &skip_style_tags))
             return NULL;
         temp = PyUnicode_FromStringAndSize(encoded, size);
         if (!text)
@@ -2698,6 +2699,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     }

     self->head = self->global = self->depth = self->cycles = 0;
     self->length = PyList_GET_SIZE(self->text);
+    self->skip_style_tags = skip_style_tags;
     return Tokenizer_parse(self, context, 1);
 }
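
With these two hunks, the C entry point accepts the flag as a third optional integer (the trailing "i" in "U|ii" and "s#|ii"). A sketch of driving it directly; the import path is an assumption based on how the parser package normally loads its C extension:

    try:
        # Assumed location of the compiled extension's tokenizer class.
        from mwparserfromhell.parser._tokenizer import CTokenizer
    except ImportError:
        CTokenizer = None   # extension not built; pure-Python tokenizer only

    if CTokenizer is not None:
        # bool subclasses int, so True satisfies the "i" format code as 1.
        tokens = CTokenizer().tokenize("''x''", 0, True)
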
@@ -223,6 +223,7 @@ typedef struct {
     int global;             /* global context */
     int depth;              /* stack recursion depth */
     int cycles;             /* total number of stack recursions */
+    int skip_style_tags;    /* temporary fix for the sometimes broken tag parser */
 } Tokenizer;

@@ -1124,7 +1124,7 @@ class Tokenizer(object):
                 self._emit_text("<")
             elif this == ">" and self._context & contexts.TAG_CLOSE:
                 return self._handle_tag_close_close()
-            elif this == next == "'":
+            elif this == next == "'" and not self._skip_style_tags:
                 result = self._parse_style()
                 if result is not None:
                     return result
@@ -1141,8 +1141,9 @@ class Tokenizer(object):
             self._emit_text(this)
             self._head += 1

-    def tokenize(self, text, context=0):
+    def tokenize(self, text, context=0, skip_style_tags=False):
         """Build a list of tokens from a string of wikicode and return it."""
+        self._skip_style_tags = skip_style_tags
         split = self.regex.split(text)
         self._text = [segment for segment in split if segment]
         self._head = self._global = self._depth = self._cycles = 0
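
The pure-Python tokenizer now mirrors that signature, which makes the effect easy to see on raw token streams (a sketch; exact token reprs may differ):

    from mwparserfromhell.parser.tokenizer import Tokenizer

    with_style = Tokenizer().tokenize("''x''")
    without_style = Tokenizer().tokenize("''x''", 0, True)

    # with_style includes TagOpenOpen/TagCloseOpen/... tokens for the
    # italics; without_style is just Text token(s) carrying "''x''" verbatim.
    print(with_style)
    print(without_style)
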
@@ -106,7 +106,7 @@ class TreeEqualityTestCase(TestCase):
             self.assertEqual(exp_attr.pad_first, act_attr.pad_first)
             self.assertEqual(exp_attr.pad_before_eq, act_attr.pad_before_eq)
             self.assertEqual(exp_attr.pad_after_eq, act_attr.pad_after_eq)
-        self.assertIs(expected.wiki_markup, actual.wiki_markup)
+        self.assertEqual(expected.wiki_markup, actual.wiki_markup)
         self.assertIs(expected.self_closing, actual.self_closing)
         self.assertIs(expected.invalid, actual.invalid)
         self.assertIs(expected.implicit, actual.implicit)
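
The loosened check is deliberate: self_closing, invalid, and implicit hold the True/False singletons, where identity (assertIs) is safe, but wiki_markup holds a string built during tokenization, which is equal to the expected literal without necessarily being the same object. A small sketch of the distinction:

    built = "'" * 2          # string constructed at runtime
    assert built == "''"     # equality holds...
    # ...but `built is "''"` may be False: equal strings are not guaranteed
    # to be interned to one object, hence assertEqual for wiki_markup.
    assert (1 == 1) is True  # booleans, by contrast, are singletons
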
@@ -24,7 +24,7 @@ from __future__ import unicode_literals
 import unittest

 from mwparserfromhell import parser
-from mwparserfromhell.nodes import Template, Text, Wikilink
+from mwparserfromhell.nodes import Tag, Template, Text, Wikilink
 from mwparserfromhell.nodes.extras import Parameter

 from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -35,10 +35,12 @@ class TestParser(TreeEqualityTestCase):

     def test_use_c(self):
         """make sure the correct tokenizer is used"""
+        restore = parser.use_c
         if parser.use_c:
             self.assertTrue(parser.Parser()._tokenizer.USES_C)
             parser.use_c = False
         self.assertFalse(parser.Parser()._tokenizer.USES_C)
+        parser.use_c = restore

     def test_parsing(self):
         """integration test for parsing overall"""
@@ -62,5 +64,26 @@ class TestParser(TreeEqualityTestCase):
         actual = parser.Parser().parse(text)
         self.assertWikicodeEqual(expected, actual)

+    def test_skip_style_tags(self):
+        """test Parser.parse(skip_style_tags=True)"""
+        def test():
+            with_style = parser.Parser().parse(text, skip_style_tags=False)
+            without_style = parser.Parser().parse(text, skip_style_tags=True)
+            self.assertWikicodeEqual(a, with_style)
+            self.assertWikicodeEqual(b, without_style)
+
+        text = "This is an example with ''italics''!"
+        a = wrap([Text("This is an example with "),
+                  Tag(wraptext("i"), wraptext("italics"), wiki_markup="''"),
+                  Text("!")])
+        b = wraptext("This is an example with ''italics''!")
+
+        restore = parser.use_c
+        if parser.use_c:
+            test()
+            parser.use_c = False
+        test()
+        parser.use_c = restore
+
 if __name__ == "__main__":
     unittest.main(verbosity=2)