Browse Source

Add a temporary skip_style_tags until we resolve some issues.

tags/v0.3.3
Ben Kurtovic 10 years ago
parent
commit
1946cf621d
8 changed files with 49 additions and 11 deletions
  1. +3
    -0
      CHANGELOG
  2. +4
    -0
      docs/changelog.rst
  3. +7
    -3
      mwparserfromhell/parser/__init__.py
  4. +6
    -4
      mwparserfromhell/parser/tokenizer.c
  5. +1
    -0
      mwparserfromhell/parser/tokenizer.h
  6. +3
    -2
      mwparserfromhell/parser/tokenizer.py
  7. +1
    -1
      tests/_test_tree_equality.py
  8. +24
    -1
      tests/test_parser.py

+ 3
- 0
CHANGELOG View File

@@ -9,6 +9,9 @@ v0.4 (unreleased):
- Wikicode.get_sections() now returns sections in the correct order.
- Wikicode.matches() now accepts a tuple or list of strings/Wikicode objects
  instead of just a single string or Wikicode.
- Given the frequency of issues with the (admittedly insufficient) tag parser,
there's a temporary skip_style_tags argument to parse() that ignores '' and
''' until these issues are corrected.
- C code cleanup and speed improvements.


v0.3.2 (released September 1, 2013):


+ 4
- 0
docs/changelog.rst View File

@@ -19,6 +19,10 @@ Unreleased
- :py:meth:`.Wikicode.matches` now accepts a tuple or list of
  strings/:py:class:`.Wikicode` objects instead of just a single string or
  :py:class:`.Wikicode`.
- Given the frequency of issues with the (admittedly insufficient) tag parser,
there's a temporary *skip_style_tags* argument to
:py:meth:`~mwparserfromhell.parse` that ignores ``''`` and ``'''`` until
these issues are corrected.
- C code cleanup and speed improvements.


v0.3.2


+ 7
- 3
mwparserfromhell/parser/__init__.py View File

@@ -53,8 +53,12 @@ class Parser(object):
self._tokenizer = Tokenizer() self._tokenizer = Tokenizer()
self._builder = Builder() self._builder = Builder()


def parse(self, text, context=0):
"""Parse *text*, returning a :py:class:`~.Wikicode` object tree."""
tokens = self._tokenizer.tokenize(text, context)
def parse(self, text, context=0, skip_style_tags=False):
"""Parse *text*, returning a :py:class:`~.Wikicode` object tree.

If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be
parsed, but instead be treated as plain text.
"""
tokens = self._tokenizer.tokenize(text, context, skip_style_tags)
code = self._builder.build(tokens) code = self._builder.build(tokens)
return code return code

+ 6
- 4
mwparserfromhell/parser/tokenizer.c View File

@@ -2640,7 +2640,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
} }
else if (this == '>' && this_context & LC_TAG_CLOSE) else if (this == '>' && this_context & LC_TAG_CLOSE)
return Tokenizer_handle_tag_close_close(self); return Tokenizer_handle_tag_close_close(self);
else if (this == next && next == '\'') {
else if (this == next && next == '\'' && !self->skip_style_tags) {
temp = Tokenizer_parse_style(self); temp = Tokenizer_parse_style(self);
if (temp != Py_None) if (temp != Py_None)
return temp; return temp;
@@ -2675,9 +2675,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{ {
PyObject *text, *temp; PyObject *text, *temp;
int context = 0;
int context = 0, skip_style_tags = 0;


if (PyArg_ParseTuple(args, "U|i", &text, &context)) {
if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
Py_XDECREF(self->text); Py_XDECREF(self->text);
self->text = PySequence_Fast(text, "expected a sequence"); self->text = PySequence_Fast(text, "expected a sequence");
} }
@@ -2686,7 +2686,8 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
Py_ssize_t size; Py_ssize_t size;
/* Failed to parse a Unicode object; try a string instead. */ /* Failed to parse a Unicode object; try a string instead. */
PyErr_Clear(); PyErr_Clear();
if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context))
if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context,
&skip_style_tags))
return NULL; return NULL;
temp = PyUnicode_FromStringAndSize(encoded, size); temp = PyUnicode_FromStringAndSize(encoded, size);
if (!text) if (!text)
@@ -2698,6 +2699,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
} }
self->head = self->global = self->depth = self->cycles = 0; self->head = self->global = self->depth = self->cycles = 0;
self->length = PyList_GET_SIZE(self->text); self->length = PyList_GET_SIZE(self->text);
self->skip_style_tags = skip_style_tags;
return Tokenizer_parse(self, context, 1); return Tokenizer_parse(self, context, 1);
} }




+ 1
- 0
mwparserfromhell/parser/tokenizer.h View File

@@ -223,6 +223,7 @@ typedef struct {
int global; /* global context */ int global; /* global context */
int depth; /* stack recursion depth */ int depth; /* stack recursion depth */
int cycles; /* total number of stack recursions */ int cycles; /* total number of stack recursions */
int skip_style_tags; /* temporary fix for the sometimes broken tag parser */
} Tokenizer; } Tokenizer;






+ 3
- 2
mwparserfromhell/parser/tokenizer.py View File

@@ -1124,7 +1124,7 @@ class Tokenizer(object):
self._emit_text("<") self._emit_text("<")
elif this == ">" and self._context & contexts.TAG_CLOSE: elif this == ">" and self._context & contexts.TAG_CLOSE:
return self._handle_tag_close_close() return self._handle_tag_close_close()
elif this == next == "'":
elif this == next == "'" and not self._skip_style_tags:
result = self._parse_style() result = self._parse_style()
if result is not None: if result is not None:
return result return result
@@ -1141,8 +1141,9 @@ class Tokenizer(object):
self._emit_text(this) self._emit_text(this)
self._head += 1 self._head += 1


def tokenize(self, text, context=0):
def tokenize(self, text, context=0, skip_style_tags=False):
"""Build a list of tokens from a string of wikicode and return it.""" """Build a list of tokens from a string of wikicode and return it."""
self._skip_style_tags = skip_style_tags
split = self.regex.split(text) split = self.regex.split(text)
self._text = [segment for segment in split if segment] self._text = [segment for segment in split if segment]
self._head = self._global = self._depth = self._cycles = 0 self._head = self._global = self._depth = self._cycles = 0


+ 1
- 1
tests/_test_tree_equality.py View File

@@ -106,7 +106,7 @@ class TreeEqualityTestCase(TestCase):
self.assertEqual(exp_attr.pad_first, act_attr.pad_first) self.assertEqual(exp_attr.pad_first, act_attr.pad_first)
self.assertEqual(exp_attr.pad_before_eq, act_attr.pad_before_eq) self.assertEqual(exp_attr.pad_before_eq, act_attr.pad_before_eq)
self.assertEqual(exp_attr.pad_after_eq, act_attr.pad_after_eq) self.assertEqual(exp_attr.pad_after_eq, act_attr.pad_after_eq)
self.assertIs(expected.wiki_markup, actual.wiki_markup)
self.assertEqual(expected.wiki_markup, actual.wiki_markup)
self.assertIs(expected.self_closing, actual.self_closing) self.assertIs(expected.self_closing, actual.self_closing)
self.assertIs(expected.invalid, actual.invalid) self.assertIs(expected.invalid, actual.invalid)
self.assertIs(expected.implicit, actual.implicit) self.assertIs(expected.implicit, actual.implicit)


+ 24
- 1
tests/test_parser.py View File

@@ -24,7 +24,7 @@ from __future__ import unicode_literals
import unittest import unittest


from mwparserfromhell import parser from mwparserfromhell import parser
from mwparserfromhell.nodes import Template, Text, Wikilink
from mwparserfromhell.nodes import Tag, Template, Text, Wikilink
from mwparserfromhell.nodes.extras import Parameter from mwparserfromhell.nodes.extras import Parameter


from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -35,10 +35,12 @@ class TestParser(TreeEqualityTestCase):


def test_use_c(self): def test_use_c(self):
"""make sure the correct tokenizer is used""" """make sure the correct tokenizer is used"""
restore = parser.use_c
if parser.use_c: if parser.use_c:
self.assertTrue(parser.Parser()._tokenizer.USES_C) self.assertTrue(parser.Parser()._tokenizer.USES_C)
parser.use_c = False parser.use_c = False
self.assertFalse(parser.Parser()._tokenizer.USES_C) self.assertFalse(parser.Parser()._tokenizer.USES_C)
parser.use_c = restore


def test_parsing(self): def test_parsing(self):
"""integration test for parsing overall""" """integration test for parsing overall"""
@@ -62,5 +64,26 @@ class TestParser(TreeEqualityTestCase):
actual = parser.Parser().parse(text) actual = parser.Parser().parse(text)
self.assertWikicodeEqual(expected, actual) self.assertWikicodeEqual(expected, actual)


def test_skip_style_tags(self):
"""test Parser.parse(skip_style_tags=True)"""
def test():
with_style = parser.Parser().parse(text, skip_style_tags=False)
without_style = parser.Parser().parse(text, skip_style_tags=True)
self.assertWikicodeEqual(a, with_style)
self.assertWikicodeEqual(b, without_style)

text = "This is an example with ''italics''!"
a = wrap([Text("This is an example with "),
Tag(wraptext("i"), wraptext("italics"), wiki_markup="''"),
Text("!")])
b = wraptext("This is an example with ''italics''!")

restore = parser.use_c
if parser.use_c:
test()
parser.use_c = False
test()
parser.use_c = restore

if __name__ == "__main__": if __name__ == "__main__":
unittest.main(verbosity=2) unittest.main(verbosity=2)

Loading…
Cancel
Save