Browse Source

Raise ParserError for internal problems. Improve coverage. Cleanup.

tags/v0.4
Ben Kurtovic 5 years ago
parent
commit
08cafc0576

+ 4
- 0
.coveragerc View File

@@ -0,0 +1,4 @@
[report]
exclude_lines =
pragma: no cover
raise NotImplementedError()

+ 2
- 0
.gitignore View File

@@ -4,9 +4,11 @@
*.dll
*.egg
*.egg-info
.coverage
.DS_Store
__pycache__
build
dist
docs/_build
scripts/*.log
htmlcov/

+ 3
- 0
CHANGELOG View File

@@ -10,6 +10,9 @@ v0.4 (unreleased):
option, RECURSE_OTHERS, which recurses over all children except instances of
'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)`
returns all un-nested templates).
- If something goes wrong while parsing, ParserError will now be raised.
Previously, the parser would produce an unclear BadRoute exception or allow
an incorrect node tree to be built.
- Fixed a parser bug involving nested tags.
- Updated and fixed some documentation.


+ 3
- 0
docs/changelog.rst View File

@@ -18,6 +18,9 @@ Unreleased
which recurses over all children except instances of *forcetype* (for
example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all un-nested
templates).
- If something goes wrong while parsing, :py:exc:`.ParserError` will now be
raised. Previously, the parser would produce an unclear :py:exc:`.BadRoute`
exception or allow an incorrect node tree to be built.
- Fixed a parser bug involving nested tags.
- Updated and fixed some documentation.


+ 2
- 2
mwparserfromhell/nodes/__init__.py View File

@@ -55,8 +55,8 @@ class Node(StringMixIn):
raise NotImplementedError()

def __children__(self):
return # Funny generator-that-yields-nothing syntax
yield
return
yield # pragma: no cover (this is a generator that yields nothing)

def __strip__(self, normalize, collapse):
return None

+ 27
- 4
mwparserfromhell/parser/__init__.py View File

@@ -26,6 +26,19 @@ modules: the :py:mod:`~.tokenizer` and the :py:mod:`~.builder`. This module
joins them together under one interface.
"""

class ParserError(Exception):
    """Exception raised when the parser catches itself in a broken state.

    This never indicates bad wikicode -- invalid markup is still expected to
    parse correctly. Instead, it means the parser detected an impossible
    internal state and bailed out before producing a wrong result, so seeing
    it always points to a bug in the parser itself.
    """

    def __init__(self, extra):
        # Wrap the internal detail in a user-facing bug-report notice.
        super(ParserError, self).__init__(
            "This is a bug and should be reported. Info: {0}.".format(extra))


from .builder import Builder
from .tokenizer import Tokenizer
try:
@@ -35,15 +48,22 @@ except ImportError:
CTokenizer = None
use_c = False

__all__ = ["use_c", "Parser"]
__all__ = ["use_c", "Parser", "ParserError"]

class Parser(object):
"""Represents a parser for wikicode.

Actual parsing is a two-step process: first, the text is split up into a
series of tokens by the :py:class:`~.Tokenizer`, and then the tokens are
converted into trees of :py:class:`~.Wikicode` objects and
:py:class:`~.Node`\ s by the :py:class:`~.Builder`.
series of tokens by the :py:class:`.Tokenizer`, and then the tokens are
converted into trees of :py:class:`.Wikicode` objects and
:py:class:`.Node`\ s by the :py:class:`.Builder`.

Instances of this class or its dependents (:py:class:`.Tokenizer` and
:py:class:`.Builder`) should not be shared between threads.
:py:meth:`parse` can be called multiple times as long as it is not done
concurrently. In general, there is no need to do this because parsing
should be done through :py:func:`mwparserfromhell.parse`, which creates a
new :py:class:`.Parser` object as necessary.
"""

def __init__(self):
@@ -65,6 +85,9 @@ class Parser(object):

If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be
parsed, but instead will be treated as plain text.

If there is an internal error while parsing, :py:exc:`.ParserError`
will be raised.
"""
tokens = self._tokenizer.tokenize(text, context, skip_style_tags)
code = self._builder.build(tokens)

+ 10
- 13
mwparserfromhell/parser/builder.py View File

@@ -22,7 +22,7 @@

from __future__ import unicode_literals

from . import tokens
from . import tokens, ParserError
from ..compat import str
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
Template, Text, Wikilink)
@@ -33,33 +33,28 @@ from ..wikicode import Wikicode
__all__ = ["Builder"]

class Builder(object):
"""Combines a sequence of tokens into a tree of ``Wikicode`` objects.
"""Builds a tree of nodes out of a sequence of tokens.

To use, pass a list of :py:class:`~.Token`\ s to the :py:meth:`build`
method. The list will be exhausted as it is parsed and a
:py:class:`~.Wikicode` object will be returned.
:py:class:`.Wikicode` object containing the node tree will be returned.
"""

def __init__(self):
    # Tokens still to be consumed; handlers appear to pop from and push
    # back onto this list while walking the token stream (see _pop usage).
    self._tokens = []
    # Stack of in-progress node lists, one per open nesting level;
    # _push() opens a level and _pop() closes it.
    self._stacks = []

def _wrap(self, nodes):
"""Properly wrap a list of nodes in a ``Wikicode`` object."""
return Wikicode(SmartList(nodes))

def _push(self):
    """Push a new, empty node list onto the stack.

    Called when the builder enters a nested structure; the matching
    _pop() collects the nodes written in between.
    """
    self._stacks.append([])

def _pop(self, wrap=True):
def _pop(self):
"""Pop the current node list off of the stack.

If *wrap* is ``True``, we will call :py:meth:`_wrap` on the list.
The raw node list is wrapped in a :py:class:`.SmartList` and then in a
:py:class:`.Wikicode` object.
"""
if wrap:
return self._wrap(self._stacks.pop())
return self._stacks.pop()
return Wikicode(SmartList(self._stacks.pop()))

def _write(self, item):
"""Append a node to the current node list."""
@@ -84,7 +79,7 @@ class Builder(object):
self._tokens.append(token)
value = self._pop()
if key is None:
key = self._wrap([Text(str(default))])
key = Wikicode(SmartList([Text(str(default))]))
return Parameter(key, value, showkey)
else:
self._write(self._handle_token(token))
@@ -270,6 +265,8 @@ class Builder(object):
return self._handle_comment()
elif isinstance(token, tokens.TagOpenOpen):
return self._handle_tag(token)
err = "_handle_token() got unexpected {0}".format(type(token).__name__)
raise ParserError(err)

def build(self, tokenlist):
"""Build a Wikicode object from a list tokens and return it."""

+ 45
- 8
mwparserfromhell/parser/tokenizer.c View File

@@ -347,7 +347,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)

/*
Fail the current tokenization route. Discards the current
stack/context/textbuffer and raises a BadRoute exception.
stack/context/textbuffer and sets the BAD_ROUTE flag.
*/
static void* Tokenizer_fail_route(Tokenizer* self)
{
@@ -2681,7 +2681,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
*/
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{
PyObject *text, *temp;
PyObject *text, *temp, *tokens;
int context = 0, skip_style_tags = 0;

if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
@@ -2704,13 +2704,29 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
Py_XDECREF(temp);
self->text = text;
}

self->head = self->global = self->depth = self->cycles = 0;
self->length = PyList_GET_SIZE(self->text);
self->skip_style_tags = skip_style_tags;
return Tokenizer_parse(self, context, 1);
tokens = Tokenizer_parse(self, context, 1);

if (!tokens && !PyErr_Occurred()) {
if (!ParserError) {
if (load_exceptions())
return NULL;
}
if (BAD_ROUTE) {
RESET_ROUTE();
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
}
else
PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
return NULL;
}
return tokens;
}

static int load_entitydefs(void)
static int load_entities(void)
{
PyObject *tempmod, *defmap, *deflist;
unsigned numdefs, i;
@@ -2814,7 +2830,7 @@ static int load_tokens(void)
return 0;
}

static int load_definitions(void)
static int load_defs(void)
{
PyObject *tempmod,
*globals = PyEval_GetGlobals(),
@@ -2835,6 +2851,29 @@ static int load_definitions(void)
return 0;
}

/*
    Import mwparserfromhell.parser and cache its ParserError class in the
    module-level ParserError pointer, so the C tokenizer can raise it.

    Returns 0 on success and -1 on failure (with a Python exception set).
*/
static int load_exceptions(void)
{
    PyObject *tempmod, *parsermod,
             *globals = PyEval_GetGlobals(),
             *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1),
             *modname = IMPORT_NAME_FUNC("parser");
    char *name = "mwparserfromhell";

    if (!fromlist || !modname) {
        /* Don't leak whichever of the two allocations succeeded. */
        Py_XDECREF(fromlist);
        Py_XDECREF(modname);
        return -1;
    }
    PyList_SET_ITEM(fromlist, 0, modname);  /* steals the modname reference */
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
        return -1;
    parsermod = PyObject_GetAttrString(tempmod, "parser");
    Py_DECREF(tempmod);
    if (!parsermod)
        return -1;
    ParserError = PyObject_GetAttrString(parsermod, "ParserError");
    Py_DECREF(parsermod);
    /* Without this check the function reported success while ParserError
       stayed NULL, and the caller would later pass a NULL exception type
       to PyErr_SetString(). */
    if (!ParserError)
        return -1;
    return 0;
}

PyMODINIT_FUNC INIT_FUNC_NAME(void)
{
PyObject *module;
@@ -2851,9 +2890,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void)
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
EMPTY = PyUnicode_FromString("");
NOARGS = PyTuple_New(0);
if (!EMPTY || !NOARGS)
INIT_ERROR;
if (load_entitydefs() || load_tokens() || load_definitions())
if (!EMPTY || !NOARGS || load_entities() || load_tokens() || load_defs())
INIT_ERROR;
#ifdef IS_PY3K
return module;

+ 3
- 0
mwparserfromhell/parser/tokenizer.h View File

@@ -62,6 +62,7 @@ static char** entitydefs;

static PyObject* EMPTY;
static PyObject* NOARGS;
static PyObject* ParserError;
static PyObject* definitions;


@@ -268,6 +269,8 @@ static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, int, int);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

static int load_exceptions(void);


/* Macros for Python 2/3 compatibility: */


+ 5
- 2
mwparserfromhell/parser/tokenizer.py View File

@@ -24,7 +24,7 @@ from __future__ import unicode_literals
from math import log
import re

from . import contexts, tokens
from . import contexts, tokens, ParserError
from ..compat import htmlentities, range
from ..definitions import (get_html_tag, is_parsable, is_single,
is_single_only, is_scheme)
@@ -1154,4 +1154,7 @@ class Tokenizer(object):
split = self.regex.split(text)
self._text = [segment for segment in split if segment]
self._head = self._global = self._depth = self._cycles = 0
return self._parse(context)
try:
return self._parse(context)
except BadRoute: # pragma: no cover (untestable/exceptional case)
raise ParserError("Python tokenizer exited with BadRoute")

+ 1
- 1
mwparserfromhell/parser/tokens.py View File

@@ -34,7 +34,7 @@ from ..compat import py3k, str

__all__ = ["Token"]

class Token (dict):
class Token(dict):
"""A token stores the semantic meaning of a unit of wikicode."""

def __repr__(self):

+ 1
- 1
mwparserfromhell/utils.py View File

@@ -66,7 +66,7 @@ def parse_anything(value, context=0, skip_style_tags=False):
nodelist = SmartList()
for item in value:
nodelist += parse_anything(item, context, skip_style_tags).nodes
return Wikicode(nodelist)
except TypeError:
error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}"
raise ValueError(error.format(type(value).__name__, value))
return Wikicode(nodelist)

+ 7
- 1
tests/test_builder.py View File

@@ -30,7 +30,7 @@ except ImportError:
from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading,
HTMLEntity, Tag, Template, Text, Wikilink)
from mwparserfromhell.nodes.extras import Attribute, Parameter
from mwparserfromhell.parser import tokens
from mwparserfromhell.parser import tokens, ParserError
from mwparserfromhell.parser.builder import Builder

from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -420,5 +420,11 @@ class TestBuilder(TreeEqualityTestCase):
named=True)]))])])
self.assertWikicodeEqual(valid, self.builder.build(test))

def test_parser_error(self):
    """Ensure ParserError is raised when the builder gets a bad token list."""
    bad_tokens = [tokens.TemplateClose()]
    expected = r"_handle_token\(\) got unexpected TemplateClose"
    self.assertRaisesRegexp(
        ParserError, expected, self.builder.build, bad_tokens)

if __name__ == "__main__":
unittest.main(verbosity=2)

Loading…
Cancel
Save