Browse Source

Raise ParserError for internal problems. Improve coverage. Cleanup.

tags/v0.4
Ben Kurtovic 9 years ago
parent
commit
08cafc0576
13 changed files with 113 additions and 32 deletions
  1. +4
    -0
      .coveragerc
  2. +2
    -0
      .gitignore
  3. +3
    -0
      CHANGELOG
  4. +3
    -0
      docs/changelog.rst
  5. +2
    -2
      mwparserfromhell/nodes/__init__.py
  6. +27
    -4
      mwparserfromhell/parser/__init__.py
  7. +10
    -13
      mwparserfromhell/parser/builder.py
  8. +45
    -8
      mwparserfromhell/parser/tokenizer.c
  9. +3
    -0
      mwparserfromhell/parser/tokenizer.h
  10. +5
    -2
      mwparserfromhell/parser/tokenizer.py
  11. +1
    -1
      mwparserfromhell/parser/tokens.py
  12. +1
    -1
      mwparserfromhell/utils.py
  13. +7
    -1
      tests/test_builder.py

+ 4
- 0
.coveragerc View File

@@ -0,0 +1,4 @@
[report]
exclude_lines =
pragma: no cover
raise NotImplementedError()

+ 2
- 0
.gitignore View File

@@ -4,9 +4,11 @@
*.dll *.dll
*.egg *.egg
*.egg-info *.egg-info
.coverage
.DS_Store .DS_Store
__pycache__ __pycache__
build build
dist dist
docs/_build docs/_build
scripts/*.log scripts/*.log
htmlcov/

+ 3
- 0
CHANGELOG View File

@@ -10,6 +10,9 @@ v0.4 (unreleased):
option, RECURSE_OTHERS, which recurses over all children except instances of option, RECURSE_OTHERS, which recurses over all children except instances of
'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)` 'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)`
returns all un-nested templates). returns all un-nested templates).
- If something goes wrong while parsing, ParserError will now be raised.
Previously, the parser would produce an unclear BadRoute exception or allow
an incorrect node tree to be built.
- Fixed a parser bug involving nested tags. - Fixed a parser bug involving nested tags.
- Updated and fixed some documentation. - Updated and fixed some documentation.




+ 3
- 0
docs/changelog.rst View File

@@ -18,6 +18,9 @@ Unreleased
which recurses over all children except instances of *forcetype* (for which recurses over all children except instances of *forcetype* (for
example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all un-nested example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all un-nested
templates). templates).
- If something goes wrong while parsing, :py:exc:`.ParserError` will now be
raised. Previously, the parser would produce an unclear :py:exc:`.BadRoute`
exception or allow an incorrect node tree to be built.
- Fixed a parser bug involving nested tags. - Fixed a parser bug involving nested tags.
- Updated and fixed some documentation. - Updated and fixed some documentation.




+ 2
- 2
mwparserfromhell/nodes/__init__.py View File

@@ -55,8 +55,8 @@ class Node(StringMixIn):
raise NotImplementedError() raise NotImplementedError()


def __children__(self): def __children__(self):
return # Funny generator-that-yields-nothing syntax
yield
return
yield # pragma: no cover (this is a generator that yields nothing)


def __strip__(self, normalize, collapse): def __strip__(self, normalize, collapse):
return None return None


+ 27
- 4
mwparserfromhell/parser/__init__.py View File

@@ -26,6 +26,19 @@ modules: the :py:mod:`~.tokenizer` and the :py:mod:`~.builder`. This module
joins them together under one interface. joins them together under one interface.
""" """


class ParserError(Exception):
"""Exception raised when an internal error occurs while parsing.

This does not mean that the wikicode was invalid, because invalid markup
should still be parsed correctly. This means that the parser caught itself
with an impossible internal state and is bailing out before other problems
can happen. Its appearance indicates a bug.
"""
def __init__(self, extra):
msg = "This is a bug and should be reported. Info: {0}.".format(extra)
super(ParserError, self).__init__(msg)


from .builder import Builder from .builder import Builder
from .tokenizer import Tokenizer from .tokenizer import Tokenizer
try: try:
@@ -35,15 +48,22 @@ except ImportError:
CTokenizer = None CTokenizer = None
use_c = False use_c = False


__all__ = ["use_c", "Parser"]
__all__ = ["use_c", "Parser", "ParserError"]


class Parser(object): class Parser(object):
"""Represents a parser for wikicode. """Represents a parser for wikicode.


Actual parsing is a two-step process: first, the text is split up into a Actual parsing is a two-step process: first, the text is split up into a
series of tokens by the :py:class:`~.Tokenizer`, and then the tokens are
converted into trees of :py:class:`~.Wikicode` objects and
:py:class:`~.Node`\ s by the :py:class:`~.Builder`.
series of tokens by the :py:class:`.Tokenizer`, and then the tokens are
converted into trees of :py:class:`.Wikicode` objects and
:py:class:`.Node`\ s by the :py:class:`.Builder`.

Instances of this class or its dependents (:py:class:`.Tokenizer` and
:py:class:`.Builder`) should not be shared between threads.
:py:meth:`parse` can be called multiple times as long as it is not done
concurrently. In general, there is no need to do this because parsing
should be done through :py:func:`mwparserfromhell.parse`, which creates a
new :py:class:`.Parser` object as necessary.
""" """


def __init__(self): def __init__(self):
@@ -65,6 +85,9 @@ class Parser(object):


If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be
parsed, but instead will be treated as plain text. parsed, but instead will be treated as plain text.

If there is an internal error while parsing, :py:exc:`.ParserError`
will be raised.
""" """
tokens = self._tokenizer.tokenize(text, context, skip_style_tags) tokens = self._tokenizer.tokenize(text, context, skip_style_tags)
code = self._builder.build(tokens) code = self._builder.build(tokens)


+ 10
- 13
mwparserfromhell/parser/builder.py View File

@@ -22,7 +22,7 @@


from __future__ import unicode_literals from __future__ import unicode_literals


from . import tokens
from . import tokens, ParserError
from ..compat import str from ..compat import str
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
Template, Text, Wikilink) Template, Text, Wikilink)
@@ -33,33 +33,28 @@ from ..wikicode import Wikicode
__all__ = ["Builder"] __all__ = ["Builder"]


class Builder(object): class Builder(object):
"""Combines a sequence of tokens into a tree of ``Wikicode`` objects.
"""Builds a tree of nodes out of a sequence of tokens.


To use, pass a list of :py:class:`~.Token`\ s to the :py:meth:`build` To use, pass a list of :py:class:`~.Token`\ s to the :py:meth:`build`
method. The list will be exhausted as it is parsed and a method. The list will be exhausted as it is parsed and a
:py:class:`~.Wikicode` object will be returned.
:py:class:`.Wikicode` object containing the node tree will be returned.
""" """


def __init__(self): def __init__(self):
self._tokens = [] self._tokens = []
self._stacks = [] self._stacks = []


def _wrap(self, nodes):
"""Properly wrap a list of nodes in a ``Wikicode`` object."""
return Wikicode(SmartList(nodes))

def _push(self): def _push(self):
"""Push a new node list onto the stack.""" """Push a new node list onto the stack."""
self._stacks.append([]) self._stacks.append([])


def _pop(self, wrap=True):
def _pop(self):
"""Pop the current node list off of the stack. """Pop the current node list off of the stack.


If *wrap* is ``True``, we will call :py:meth:`_wrap` on the list.
The raw node list is wrapped in a :py:class:`.SmartList` and then in a
:py:class:`.Wikicode` object.
""" """
if wrap:
return self._wrap(self._stacks.pop())
return self._stacks.pop()
return Wikicode(SmartList(self._stacks.pop()))


def _write(self, item): def _write(self, item):
"""Append a node to the current node list.""" """Append a node to the current node list."""
@@ -84,7 +79,7 @@ class Builder(object):
self._tokens.append(token) self._tokens.append(token)
value = self._pop() value = self._pop()
if key is None: if key is None:
key = self._wrap([Text(str(default))])
key = Wikicode(SmartList([Text(str(default))]))
return Parameter(key, value, showkey) return Parameter(key, value, showkey)
else: else:
self._write(self._handle_token(token)) self._write(self._handle_token(token))
@@ -270,6 +265,8 @@ class Builder(object):
return self._handle_comment() return self._handle_comment()
elif isinstance(token, tokens.TagOpenOpen): elif isinstance(token, tokens.TagOpenOpen):
return self._handle_tag(token) return self._handle_tag(token)
err = "_handle_token() got unexpected {0}".format(type(token).__name__)
raise ParserError(err)


def build(self, tokenlist): def build(self, tokenlist):
"""Build a Wikicode object from a list tokens and return it.""" """Build a Wikicode object from a list tokens and return it."""


+ 45
- 8
mwparserfromhell/parser/tokenizer.c View File

@@ -347,7 +347,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)


/* /*
Fail the current tokenization route. Discards the current Fail the current tokenization route. Discards the current
stack/context/textbuffer and raises a BadRoute exception.
stack/context/textbuffer and sets the BAD_ROUTE flag.
*/ */
static void* Tokenizer_fail_route(Tokenizer* self) static void* Tokenizer_fail_route(Tokenizer* self)
{ {
@@ -2681,7 +2681,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
*/ */
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{ {
PyObject *text, *temp;
PyObject *text, *temp, *tokens;
int context = 0, skip_style_tags = 0; int context = 0, skip_style_tags = 0;


if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
@@ -2704,13 +2704,29 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
Py_XDECREF(temp); Py_XDECREF(temp);
self->text = text; self->text = text;
} }

self->head = self->global = self->depth = self->cycles = 0; self->head = self->global = self->depth = self->cycles = 0;
self->length = PyList_GET_SIZE(self->text); self->length = PyList_GET_SIZE(self->text);
self->skip_style_tags = skip_style_tags; self->skip_style_tags = skip_style_tags;
return Tokenizer_parse(self, context, 1);
tokens = Tokenizer_parse(self, context, 1);

if (!tokens && !PyErr_Occurred()) {
if (!ParserError) {
if (load_exceptions())
return NULL;
}
if (BAD_ROUTE) {
RESET_ROUTE();
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
}
else
PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
return NULL;
}
return tokens;
} }


static int load_entitydefs(void)
static int load_entities(void)
{ {
PyObject *tempmod, *defmap, *deflist; PyObject *tempmod, *defmap, *deflist;
unsigned numdefs, i; unsigned numdefs, i;
@@ -2814,7 +2830,7 @@ static int load_tokens(void)
return 0; return 0;
} }


static int load_definitions(void)
static int load_defs(void)
{ {
PyObject *tempmod, PyObject *tempmod,
*globals = PyEval_GetGlobals(), *globals = PyEval_GetGlobals(),
@@ -2835,6 +2851,29 @@ static int load_definitions(void)
return 0; return 0;
} }


static int load_exceptions(void)
{
PyObject *tempmod, *parsermod,
*globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(),
*fromlist = PyList_New(1),
*modname = IMPORT_NAME_FUNC("parser");
char *name = "mwparserfromhell";

if (!fromlist || !modname)
return -1;
PyList_SET_ITEM(fromlist, 0, modname);
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
Py_DECREF(fromlist);
if (!tempmod)
return -1;
parsermod = PyObject_GetAttrString(tempmod, "parser");
Py_DECREF(tempmod);
ParserError = PyObject_GetAttrString(parsermod, "ParserError");
Py_DECREF(parsermod);
return 0;
}

PyMODINIT_FUNC INIT_FUNC_NAME(void) PyMODINIT_FUNC INIT_FUNC_NAME(void)
{ {
PyObject *module; PyObject *module;
@@ -2851,9 +2890,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void)
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True); PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
EMPTY = PyUnicode_FromString(""); EMPTY = PyUnicode_FromString("");
NOARGS = PyTuple_New(0); NOARGS = PyTuple_New(0);
if (!EMPTY || !NOARGS)
INIT_ERROR;
if (load_entitydefs() || load_tokens() || load_definitions())
if (!EMPTY || !NOARGS || load_entities() || load_tokens() || load_defs())
INIT_ERROR; INIT_ERROR;
#ifdef IS_PY3K #ifdef IS_PY3K
return module; return module;


+ 3
- 0
mwparserfromhell/parser/tokenizer.h View File

@@ -62,6 +62,7 @@ static char** entitydefs;


static PyObject* EMPTY; static PyObject* EMPTY;
static PyObject* NOARGS; static PyObject* NOARGS;
static PyObject* ParserError;
static PyObject* definitions; static PyObject* definitions;




@@ -268,6 +269,8 @@ static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, int, int); static PyObject* Tokenizer_parse(Tokenizer*, int, int);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);


static int load_exceptions(void);



/* Macros for Python 2/3 compatibility: */ /* Macros for Python 2/3 compatibility: */




+ 5
- 2
mwparserfromhell/parser/tokenizer.py View File

@@ -24,7 +24,7 @@ from __future__ import unicode_literals
from math import log from math import log
import re import re


from . import contexts, tokens
from . import contexts, tokens, ParserError
from ..compat import htmlentities, range from ..compat import htmlentities, range
from ..definitions import (get_html_tag, is_parsable, is_single, from ..definitions import (get_html_tag, is_parsable, is_single,
is_single_only, is_scheme) is_single_only, is_scheme)
@@ -1154,4 +1154,7 @@ class Tokenizer(object):
split = self.regex.split(text) split = self.regex.split(text)
self._text = [segment for segment in split if segment] self._text = [segment for segment in split if segment]
self._head = self._global = self._depth = self._cycles = 0 self._head = self._global = self._depth = self._cycles = 0
return self._parse(context)
try:
return self._parse(context)
except BadRoute: # pragma: no cover (untestable/exceptional case)
raise ParserError("Python tokenizer exited with BadRoute")

+ 1
- 1
mwparserfromhell/parser/tokens.py View File

@@ -34,7 +34,7 @@ from ..compat import py3k, str


__all__ = ["Token"] __all__ = ["Token"]


class Token (dict):
class Token(dict):
"""A token stores the semantic meaning of a unit of wikicode.""" """A token stores the semantic meaning of a unit of wikicode."""


def __repr__(self): def __repr__(self):


+ 1
- 1
mwparserfromhell/utils.py View File

@@ -66,7 +66,7 @@ def parse_anything(value, context=0, skip_style_tags=False):
nodelist = SmartList() nodelist = SmartList()
for item in value: for item in value:
nodelist += parse_anything(item, context, skip_style_tags).nodes nodelist += parse_anything(item, context, skip_style_tags).nodes
return Wikicode(nodelist)
except TypeError: except TypeError:
error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}"
raise ValueError(error.format(type(value).__name__, value)) raise ValueError(error.format(type(value).__name__, value))
return Wikicode(nodelist)

+ 7
- 1
tests/test_builder.py View File

@@ -30,7 +30,7 @@ except ImportError:
from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading,
HTMLEntity, Tag, Template, Text, Wikilink) HTMLEntity, Tag, Template, Text, Wikilink)
from mwparserfromhell.nodes.extras import Attribute, Parameter from mwparserfromhell.nodes.extras import Attribute, Parameter
from mwparserfromhell.parser import tokens
from mwparserfromhell.parser import tokens, ParserError
from mwparserfromhell.parser.builder import Builder from mwparserfromhell.parser.builder import Builder


from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -420,5 +420,11 @@ class TestBuilder(TreeEqualityTestCase):
named=True)]))])]) named=True)]))])])
self.assertWikicodeEqual(valid, self.builder.build(test)) self.assertWikicodeEqual(valid, self.builder.build(test))


def test_parser_error(self):
"""test whether ParserError gets thrown for bad input"""
msg = r"_handle_token\(\) got unexpected TemplateClose"
self.assertRaisesRegexp(
ParserError, msg, self.builder.build, [tokens.TemplateClose()])

if __name__ == "__main__": if __name__ == "__main__":
unittest.main(verbosity=2) unittest.main(verbosity=2)

Loading…
Cancel
Save