diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..0a92f19 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,4 @@ +[report] +exclude_lines = + pragma: no cover + raise NotImplementedError() diff --git a/.gitignore b/.gitignore index f7f7bd9..3da2db3 100644 --- a/.gitignore +++ b/.gitignore @@ -4,9 +4,11 @@ *.dll *.egg *.egg-info +.coverage .DS_Store __pycache__ build dist docs/_build scripts/*.log +htmlcov/ diff --git a/CHANGELOG b/CHANGELOG index 4f4f77b..d733cee 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -10,6 +10,9 @@ v0.4 (unreleased): option, RECURSE_OTHERS, which recurses over all children except instances of 'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)` returns all un-nested templates). +- If something goes wrong while parsing, ParserError will now be raised. + Previously, the parser would produce an unclear BadRoute exception or allow + an incorrect node tree to be built. - Fixed a parser bug involving nested tags. - Updated and fixed some documentation. diff --git a/docs/changelog.rst b/docs/changelog.rst index 0576d29..a530733 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -18,6 +18,9 @@ Unreleased which recurses over all children except instances of *forcetype* (for example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all un-nested templates). +- If something goes wrong while parsing, :py:exc:`.ParserError` will now be + raised. Previously, the parser would produce an unclear :py:exc:`.BadRoute` + exception or allow an incorrect node tree to be built. - Fixed a parser bug involving nested tags. - Updated and fixed some documentation. 
diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index 223cc67..d6f60bd 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -55,8 +55,8 @@ class Node(StringMixIn): raise NotImplementedError() def __children__(self): - return # Funny generator-that-yields-nothing syntax - yield + return + yield # pragma: no cover (this is a generator that yields nothing) def __strip__(self, normalize, collapse): return None diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 8bac295..467d5df 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -26,6 +26,19 @@ modules: the :py:mod:`~.tokenizer` and the :py:mod:`~.builder`. This module joins them together under one interface. """ +class ParserError(Exception): + """Exception raised when an internal error occurs while parsing. + + This does not mean that the wikicode was invalid, because invalid markup + should still be parsed correctly. This means that the parser caught itself + with an impossible internal state and is bailing out before other problems + can happen. Its appearance indicates a bug. + """ + def __init__(self, extra): + msg = "This is a bug and should be reported. Info: {0}.".format(extra) + super(ParserError, self).__init__(msg) + + from .builder import Builder from .tokenizer import Tokenizer try: @@ -35,15 +48,22 @@ except ImportError: CTokenizer = None use_c = False -__all__ = ["use_c", "Parser"] +__all__ = ["use_c", "Parser", "ParserError"] class Parser(object): """Represents a parser for wikicode. Actual parsing is a two-step process: first, the text is split up into a - series of tokens by the :py:class:`~.Tokenizer`, and then the tokens are - converted into trees of :py:class:`~.Wikicode` objects and - :py:class:`~.Node`\ s by the :py:class:`~.Builder`. 
+ series of tokens by the :py:class:`.Tokenizer`, and then the tokens are + converted into trees of :py:class:`.Wikicode` objects and + :py:class:`.Node`\ s by the :py:class:`.Builder`. + + Instances of this class or its dependents (:py:class:`.Tokenizer` and + :py:class:`.Builder`) should not be shared between threads. + :py:meth:`parse` can be called multiple times as long as it is not done + concurrently. In general, there is no need to do this because parsing + should be done through :py:func:`mwparserfromhell.parse`, which creates a + new :py:class:`.Parser` object as necessary. """ def __init__(self): @@ -65,6 +85,9 @@ class Parser(object): If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be parsed, but instead will be treated as plain text. + + If there is an internal error while parsing, :py:exc:`.ParserError` + will be raised. """ tokens = self._tokenizer.tokenize(text, context, skip_style_tags) code = self._builder.build(tokens) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 5f8ce45..559bd54 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -22,7 +22,7 @@ from __future__ import unicode_literals -from . import tokens +from . import tokens, ParserError from ..compat import str from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, Template, Text, Wikilink) @@ -33,33 +33,28 @@ from ..wikicode import Wikicode __all__ = ["Builder"] class Builder(object): - """Combines a sequence of tokens into a tree of ``Wikicode`` objects. + """Builds a tree of nodes out of a sequence of tokens. To use, pass a list of :py:class:`~.Token`\ s to the :py:meth:`build` method. The list will be exhausted as it is parsed and a - :py:class:`~.Wikicode` object will be returned. + :py:class:`.Wikicode` object containing the node tree will be returned. 
""" def __init__(self): self._tokens = [] self._stacks = [] - def _wrap(self, nodes): - """Properly wrap a list of nodes in a ``Wikicode`` object.""" - return Wikicode(SmartList(nodes)) - def _push(self): """Push a new node list onto the stack.""" self._stacks.append([]) - def _pop(self, wrap=True): + def _pop(self): """Pop the current node list off of the stack. - If *wrap* is ``True``, we will call :py:meth:`_wrap` on the list. + The raw node list is wrapped in a :py:class:`.SmartList` and then in a + :py:class:`.Wikicode` object. """ - if wrap: - return self._wrap(self._stacks.pop()) - return self._stacks.pop() + return Wikicode(SmartList(self._stacks.pop())) def _write(self, item): """Append a node to the current node list.""" @@ -84,7 +79,7 @@ class Builder(object): self._tokens.append(token) value = self._pop() if key is None: - key = self._wrap([Text(str(default))]) + key = Wikicode(SmartList([Text(str(default))])) return Parameter(key, value, showkey) else: self._write(self._handle_token(token)) @@ -270,6 +265,8 @@ class Builder(object): return self._handle_comment() elif isinstance(token, tokens.TagOpenOpen): return self._handle_tag(token) + err = "_handle_token() got unexpected {0}".format(type(token).__name__) + raise ParserError(err) def build(self, tokenlist): """Build a Wikicode object from a list tokens and return it.""" diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 41ce5ac..6ab8570 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -347,7 +347,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) /* Fail the current tokenization route. Discards the current - stack/context/textbuffer and raises a BadRoute exception. + stack/context/textbuffer and sets the BAD_ROUTE flag. 
*/ static void* Tokenizer_fail_route(Tokenizer* self) { @@ -2681,7 +2681,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) */ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) { - PyObject *text, *temp; + PyObject *text, *temp, *tokens; int context = 0, skip_style_tags = 0; if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { @@ -2704,13 +2704,29 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) Py_XDECREF(temp); self->text = text; } + self->head = self->global = self->depth = self->cycles = 0; self->length = PyList_GET_SIZE(self->text); self->skip_style_tags = skip_style_tags; - return Tokenizer_parse(self, context, 1); + tokens = Tokenizer_parse(self, context, 1); + + if (!tokens && !PyErr_Occurred()) { + if (!ParserError) { + if (load_exceptions()) + return NULL; + } + if (BAD_ROUTE) { + RESET_ROUTE(); + PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); + } + else + PyErr_SetString(ParserError, "C tokenizer exited unexpectedly"); + return NULL; + } + return tokens; } -static int load_entitydefs(void) +static int load_entities(void) { PyObject *tempmod, *defmap, *deflist; unsigned numdefs, i; @@ -2814,7 +2830,7 @@ static int load_tokens(void) return 0; } -static int load_definitions(void) +static int load_defs(void) { PyObject *tempmod, *globals = PyEval_GetGlobals(), @@ -2835,6 +2851,29 @@ static int load_definitions(void) return 0; } +static int load_exceptions(void) +{ + PyObject *tempmod, *parsermod, + *globals = PyEval_GetGlobals(), + *locals = PyEval_GetLocals(), + *fromlist = PyList_New(1), + *modname = IMPORT_NAME_FUNC("parser"); + char *name = "mwparserfromhell"; + + if (!fromlist || !modname) + return -1; + PyList_SET_ITEM(fromlist, 0, modname); + tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0); + Py_DECREF(fromlist); + if (!tempmod) + return -1; + parsermod = PyObject_GetAttrString(tempmod, "parser"); + Py_DECREF(tempmod); 
+ ParserError = PyObject_GetAttrString(parsermod, "ParserError"); + Py_DECREF(parsermod); + return 0; +} + PyMODINIT_FUNC INIT_FUNC_NAME(void) { PyObject *module; @@ -2851,9 +2890,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void) PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True); EMPTY = PyUnicode_FromString(""); NOARGS = PyTuple_New(0); - if (!EMPTY || !NOARGS) - INIT_ERROR; - if (load_entitydefs() || load_tokens() || load_definitions()) + if (!EMPTY || !NOARGS || load_entities() || load_tokens() || load_defs()) INIT_ERROR; #ifdef IS_PY3K return module; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 032480d..4312e2f 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -62,6 +62,7 @@ static char** entitydefs; static PyObject* EMPTY; static PyObject* NOARGS; +static PyObject* ParserError; static PyObject* definitions; @@ -268,6 +269,8 @@ static int Tokenizer_parse_tag(Tokenizer*); static PyObject* Tokenizer_parse(Tokenizer*, int, int); static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); +static int load_exceptions(void); + /* Macros for Python 2/3 compatibility: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e69a823..9af9204 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -24,7 +24,7 @@ from __future__ import unicode_literals from math import log import re -from . import contexts, tokens +from . 
import contexts, tokens, ParserError from ..compat import htmlentities, range from ..definitions import (get_html_tag, is_parsable, is_single, is_single_only, is_scheme) @@ -1154,4 +1154,7 @@ class Tokenizer(object): split = self.regex.split(text) self._text = [segment for segment in split if segment] self._head = self._global = self._depth = self._cycles = 0 - return self._parse(context) + try: + return self._parse(context) + except BadRoute: # pragma: no cover (untestable/exceptional case) + raise ParserError("Python tokenizer exited with BadRoute") diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 40e5158..c7cc3ef 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -34,7 +34,7 @@ from ..compat import py3k, str __all__ = ["Token"] -class Token (dict): +class Token(dict): """A token stores the semantic meaning of a unit of wikicode.""" def __repr__(self): diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index fd54ad0..8dc5e4e 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -66,7 +66,7 @@ def parse_anything(value, context=0, skip_style_tags=False): nodelist = SmartList() for item in value: nodelist += parse_anything(item, context, skip_style_tags).nodes + return Wikicode(nodelist) except TypeError: error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" raise ValueError(error.format(type(value).__name__, value)) - return Wikicode(nodelist) diff --git a/tests/test_builder.py b/tests/test_builder.py index c8fdca3..ed306f7 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -30,7 +30,7 @@ except ImportError: from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, Template, Text, Wikilink) from mwparserfromhell.nodes.extras import Attribute, Parameter -from mwparserfromhell.parser import tokens +from mwparserfromhell.parser import tokens, ParserError from 
mwparserfromhell.parser.builder import Builder from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext @@ -420,5 +420,11 @@ class TestBuilder(TreeEqualityTestCase): named=True)]))])]) self.assertWikicodeEqual(valid, self.builder.build(test)) + def test_parser_error(self): + """test whether ParserError gets thrown for bad input""" + msg = r"_handle_token\(\) got unexpected TemplateClose" + self.assertRaisesRegexp( + ParserError, msg, self.builder.build, [tokens.TemplateClose()]) + if __name__ == "__main__": unittest.main(verbosity=2)