Parcourir la source

Add a temporary skip_style_tags until we resolve some issues.

tags/v0.3.3
Ben Kurtovic il y a 10 ans
Parent
révision
1946cf621d
8 fichiers modifiés avec 49 ajouts et 11 suppressions
  1. +3
    -0
      CHANGELOG
  2. +4
    -0
      docs/changelog.rst
  3. +7
    -3
      mwparserfromhell/parser/__init__.py
  4. +6
    -4
      mwparserfromhell/parser/tokenizer.c
  5. +1
    -0
      mwparserfromhell/parser/tokenizer.h
  6. +3
    -2
      mwparserfromhell/parser/tokenizer.py
  7. +1
    -1
      tests/_test_tree_equality.py
  8. +24
    -1
      tests/test_parser.py

+ 3
- 0
CHANGELOG Voir le fichier

@@ -9,6 +9,9 @@ v0.4 (unreleased):
- Wikicode.get_sections() now returns sections in the correct order.
- Wikicode.matches() now accepts a tuple or list of strings/Wikicode objects
instead of just a single string or Wikicode.
- Given the frequency of issues with the (admittedly insufficient) tag parser,
there's a temporary skip_style_tags argument to parse() that ignores '' and
''' until these issues are corrected.
- C code cleanup and speed improvements.

v0.3.2 (released September 1, 2013):


+ 4
- 0
docs/changelog.rst Voir le fichier

@@ -19,6 +19,10 @@ Unreleased
- :py:meth:`.Wikicode.matches` now accepts a tuple or list of
strings/:py:class:`.Wikicode` objects instead of just a single string or
:py:class:`.Wikicode`.
- Given the frequency of issues with the (admittedly insufficient) tag parser,
there's a temporary *skip_style_tags* argument to
:py:meth:`~mwparserfromhell.parse` that ignores ``''`` and ``'''`` until
these issues are corrected.
- C code cleanup and speed improvements.

v0.3.2


+ 7
- 3
mwparserfromhell/parser/__init__.py Voir le fichier

@@ -53,8 +53,12 @@ class Parser(object):
self._tokenizer = Tokenizer()
self._builder = Builder()

def parse(self, text, context=0):
"""Parse *text*, returning a :py:class:`~.Wikicode` object tree."""
tokens = self._tokenizer.tokenize(text, context)
def parse(self, text, context=0, skip_style_tags=False):
    """Tokenize *text* and build a :py:class:`~.Wikicode` tree from it.

    *context* is handed to the tokenizer unchanged. When *skip_style_tags*
    is ``True``, ``''`` and ``'''`` are left as plain text rather than being
    parsed as style tags.
    """
    # Arguments are forwarded positionally, exactly as the tokenizer's
    # signature orders them (text, context, skip_style_tags).
    token_list = self._tokenizer.tokenize(text, context, skip_style_tags)
    return self._builder.build(token_list)

+ 6
- 4
mwparserfromhell/parser/tokenizer.c Voir le fichier

@@ -2640,7 +2640,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
}
else if (this == '>' && this_context & LC_TAG_CLOSE)
return Tokenizer_handle_tag_close_close(self);
else if (this == next && next == '\'') {
else if (this == next && next == '\'' && !self->skip_style_tags) {
temp = Tokenizer_parse_style(self);
if (temp != Py_None)
return temp;
@@ -2675,9 +2675,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{
PyObject *text, *temp;
int context = 0;
int context = 0, skip_style_tags = 0;

if (PyArg_ParseTuple(args, "U|i", &text, &context)) {
if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
Py_XDECREF(self->text);
self->text = PySequence_Fast(text, "expected a sequence");
}
@@ -2686,7 +2686,8 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
Py_ssize_t size;
/* Failed to parse a Unicode object; try a string instead. */
PyErr_Clear();
if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context))
if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context,
&skip_style_tags))
return NULL;
temp = PyUnicode_FromStringAndSize(encoded, size);
if (!text)
@@ -2698,6 +2699,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
}
self->head = self->global = self->depth = self->cycles = 0;
self->length = PyList_GET_SIZE(self->text);
self->skip_style_tags = skip_style_tags;
return Tokenizer_parse(self, context, 1);
}



+ 1
- 0
mwparserfromhell/parser/tokenizer.h Voir le fichier

@@ -223,6 +223,7 @@ typedef struct {
int global; /* global context */
int depth; /* stack recursion depth */
int cycles; /* total number of stack recursions */
int skip_style_tags; /* temporary fix for the sometimes broken tag parser */
} Tokenizer;




+ 3
- 2
mwparserfromhell/parser/tokenizer.py Voir le fichier

@@ -1124,7 +1124,7 @@ class Tokenizer(object):
self._emit_text("<")
elif this == ">" and self._context & contexts.TAG_CLOSE:
return self._handle_tag_close_close()
elif this == next == "'":
elif this == next == "'" and not self._skip_style_tags:
result = self._parse_style()
if result is not None:
return result
@@ -1141,8 +1141,9 @@ class Tokenizer(object):
self._emit_text(this)
self._head += 1

def tokenize(self, text, context=0):
def tokenize(self, text, context=0, skip_style_tags=False):
"""Build a list of tokens from a string of wikicode and return it."""
self._skip_style_tags = skip_style_tags
split = self.regex.split(text)
self._text = [segment for segment in split if segment]
self._head = self._global = self._depth = self._cycles = 0


+ 1
- 1
tests/_test_tree_equality.py Voir le fichier

@@ -106,7 +106,7 @@ class TreeEqualityTestCase(TestCase):
self.assertEqual(exp_attr.pad_first, act_attr.pad_first)
self.assertEqual(exp_attr.pad_before_eq, act_attr.pad_before_eq)
self.assertEqual(exp_attr.pad_after_eq, act_attr.pad_after_eq)
self.assertIs(expected.wiki_markup, actual.wiki_markup)
self.assertEqual(expected.wiki_markup, actual.wiki_markup)
self.assertIs(expected.self_closing, actual.self_closing)
self.assertIs(expected.invalid, actual.invalid)
self.assertIs(expected.implicit, actual.implicit)


+ 24
- 1
tests/test_parser.py Voir le fichier

@@ -24,7 +24,7 @@ from __future__ import unicode_literals
import unittest

from mwparserfromhell import parser
from mwparserfromhell.nodes import Template, Text, Wikilink
from mwparserfromhell.nodes import Tag, Template, Text, Wikilink
from mwparserfromhell.nodes.extras import Parameter

from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -35,10 +35,12 @@ class TestParser(TreeEqualityTestCase):

def test_use_c(self):
    """Make sure the correct tokenizer is used.

    When ``parser.use_c`` is enabled, a fresh ``Parser`` must report a
    tokenizer with ``USES_C`` set; after ``parser.use_c`` is switched off,
    a fresh ``Parser`` must report one without it.
    """
    restore = parser.use_c
    try:
        if parser.use_c:
            self.assertTrue(parser.Parser()._tokenizer.USES_C)
            parser.use_c = False
        self.assertFalse(parser.Parser()._tokenizer.USES_C)
    finally:
        # Restore the module-level flag even when an assertion fails, so a
        # failure here cannot leak ``use_c = False`` into later tests.
        parser.use_c = restore

def test_parsing(self):
"""integration test for parsing overall"""
@@ -62,5 +64,26 @@ class TestParser(TreeEqualityTestCase):
actual = parser.Parser().parse(text)
self.assertWikicodeEqual(expected, actual)

def test_skip_style_tags(self):
    """Test Parser.parse(skip_style_tags=True).

    The same text is parsed with and without *skip_style_tags*: the first
    must yield an italic ``Tag`` node, the second plain text only. The
    check runs with ``parser.use_c`` as found (if enabled) and again with
    it disabled.
    """
    def check():
        with_style = parser.Parser().parse(text, skip_style_tags=False)
        without_style = parser.Parser().parse(text, skip_style_tags=True)
        self.assertWikicodeEqual(a, with_style)
        self.assertWikicodeEqual(b, without_style)

    text = "This is an example with ''italics''!"
    a = wrap([Text("This is an example with "),
              Tag(wraptext("i"), wraptext("italics"), wiki_markup="''"),
              Text("!")])
    b = wraptext("This is an example with ''italics''!")

    restore = parser.use_c
    try:
        if parser.use_c:
            check()
            parser.use_c = False
        check()
    finally:
        # Restore the module-level flag even when a check fails, so a
        # failure here cannot leak ``use_c = False`` into later tests.
        parser.use_c = restore

if __name__ == "__main__":
unittest.main(verbosity=2)

Chargement…
Annuler
Enregistrer