Bladeren bron

Raise ParserError for internal problems. Improve coverage. Cleanup.

tags/v0.4
Ben Kurtovic 10 jaren geleden
bovenliggende
commit
08cafc0576
13 gewijzigde bestanden met toevoegingen van 113 en 32 verwijderingen
  1. +4
    -0
      .coveragerc
  2. +2
    -0
      .gitignore
  3. +3
    -0
      CHANGELOG
  4. +3
    -0
      docs/changelog.rst
  5. +2
    -2
      mwparserfromhell/nodes/__init__.py
  6. +27
    -4
      mwparserfromhell/parser/__init__.py
  7. +10
    -13
      mwparserfromhell/parser/builder.py
  8. +45
    -8
      mwparserfromhell/parser/tokenizer.c
  9. +3
    -0
      mwparserfromhell/parser/tokenizer.h
  10. +5
    -2
      mwparserfromhell/parser/tokenizer.py
  11. +1
    -1
      mwparserfromhell/parser/tokens.py
  12. +1
    -1
      mwparserfromhell/utils.py
  13. +7
    -1
      tests/test_builder.py

+ 4
- 0
.coveragerc Bestand weergeven

@@ -0,0 +1,4 @@
[report]
exclude_lines =
pragma: no cover
raise NotImplementedError()

+ 2
- 0
.gitignore Bestand weergeven

@@ -4,9 +4,11 @@
*.dll
*.egg
*.egg-info
.coverage
.DS_Store
__pycache__
build
dist
docs/_build
scripts/*.log
htmlcov/

+ 3
- 0
CHANGELOG Bestand weergeven

@@ -10,6 +10,9 @@ v0.4 (unreleased):
option, RECURSE_OTHERS, which recurses over all children except instances of
'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)`
returns all un-nested templates).
- If something goes wrong while parsing, ParserError will now be raised.
Previously, the parser would produce an unclear BadRoute exception or allow
an incorrect node tree to be built.
- Fixed a parser bug involving nested tags.
- Updated and fixed some documentation.



+ 3
- 0
docs/changelog.rst Bestand weergeven

@@ -18,6 +18,9 @@ Unreleased
which recurses over all children except instances of *forcetype* (for
example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all un-nested
templates).
- If something goes wrong while parsing, :py:exc:`.ParserError` will now be
raised. Previously, the parser would produce an unclear :py:exc:`.BadRoute`
exception or allow an incorrect node tree to be built.
- Fixed a parser bug involving nested tags.
- Updated and fixed some documentation.



+ 2
- 2
mwparserfromhell/nodes/__init__.py Bestand weergeven

@@ -55,8 +55,8 @@ class Node(StringMixIn):
raise NotImplementedError()

def __children__(self):
return # Funny generator-that-yields-nothing syntax
yield
return
yield # pragma: no cover (this is a generator that yields nothing)

def __strip__(self, normalize, collapse):
return None


+ 27
- 4
mwparserfromhell/parser/__init__.py Bestand weergeven

@@ -26,6 +26,19 @@ modules: the :py:mod:`~.tokenizer` and the :py:mod:`~.builder`. This module
joins them together under one interface.
"""

class ParserError(Exception):
    """Signals that the parser reached an impossible internal state.

    Invalid wikicode is *not* the cause: malformed markup is still expected
    to parse correctly. Seeing this exception means the parser noticed that
    its own state was inconsistent and aborted before further problems could
    occur, so it always points to a bug in the parser itself.

    *extra* is a short description of the problem; it is embedded in the
    final exception message.
    """

    def __init__(self, extra):
        template = "This is a bug and should be reported. Info: {0}."
        super(ParserError, self).__init__(template.format(extra))


from .builder import Builder
from .tokenizer import Tokenizer
try:
@@ -35,15 +48,22 @@ except ImportError:
CTokenizer = None
use_c = False

__all__ = ["use_c", "Parser"]
__all__ = ["use_c", "Parser", "ParserError"]

class Parser(object):
"""Represents a parser for wikicode.

Actual parsing is a two-step process: first, the text is split up into a
series of tokens by the :py:class:`~.Tokenizer`, and then the tokens are
converted into trees of :py:class:`~.Wikicode` objects and
:py:class:`~.Node`\ s by the :py:class:`~.Builder`.
series of tokens by the :py:class:`.Tokenizer`, and then the tokens are
converted into trees of :py:class:`.Wikicode` objects and
:py:class:`.Node`\ s by the :py:class:`.Builder`.

Instances of this class or its dependents (:py:class:`.Tokenizer` and
:py:class:`.Builder`) should not be shared between threads.
:py:meth:`parse` can be called multiple times as long as it is not done
concurrently. In general, there is no need to do this because parsing
should be done through :py:func:`mwparserfromhell.parse`, which creates a
new :py:class:`.Parser` object as necessary.
"""

def __init__(self):
@@ -65,6 +85,9 @@ class Parser(object):

If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be
parsed, but instead will be treated as plain text.

If there is an internal error while parsing, :py:exc:`.ParserError`
will be raised.
"""
tokens = self._tokenizer.tokenize(text, context, skip_style_tags)
code = self._builder.build(tokens)


+ 10
- 13
mwparserfromhell/parser/builder.py Bestand weergeven

@@ -22,7 +22,7 @@

from __future__ import unicode_literals

from . import tokens
from . import tokens, ParserError
from ..compat import str
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
Template, Text, Wikilink)
@@ -33,33 +33,28 @@ from ..wikicode import Wikicode
__all__ = ["Builder"]

class Builder(object):
"""Combines a sequence of tokens into a tree of ``Wikicode`` objects.
"""Builds a tree of nodes out of a sequence of tokens.

To use, pass a list of :py:class:`~.Token`\ s to the :py:meth:`build`
method. The list will be exhausted as it is parsed and a
:py:class:`~.Wikicode` object will be returned.
:py:class:`.Wikicode` object containing the node tree will be returned.
"""

def __init__(self):
self._tokens = []
self._stacks = []

def _wrap(self, nodes):
"""Properly wrap a list of nodes in a ``Wikicode`` object."""
return Wikicode(SmartList(nodes))

def _push(self):
"""Push a new node list onto the stack."""
self._stacks.append([])

def _pop(self, wrap=True):
def _pop(self):
"""Pop the current node list off of the stack.

If *wrap* is ``True``, we will call :py:meth:`_wrap` on the list.
The raw node list is wrapped in a :py:class:`.SmartList` and then in a
:py:class:`.Wikicode` object.
"""
if wrap:
return self._wrap(self._stacks.pop())
return self._stacks.pop()
return Wikicode(SmartList(self._stacks.pop()))

def _write(self, item):
"""Append a node to the current node list."""
@@ -84,7 +79,7 @@ class Builder(object):
self._tokens.append(token)
value = self._pop()
if key is None:
key = self._wrap([Text(str(default))])
key = Wikicode(SmartList([Text(str(default))]))
return Parameter(key, value, showkey)
else:
self._write(self._handle_token(token))
@@ -270,6 +265,8 @@ class Builder(object):
return self._handle_comment()
elif isinstance(token, tokens.TagOpenOpen):
return self._handle_tag(token)
err = "_handle_token() got unexpected {0}".format(type(token).__name__)
raise ParserError(err)

def build(self, tokenlist):
"""Build a Wikicode object from a list tokens and return it."""


+ 45
- 8
mwparserfromhell/parser/tokenizer.c Bestand weergeven

@@ -347,7 +347,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)

/*
Fail the current tokenization route. Discards the current
stack/context/textbuffer and raises a BadRoute exception.
stack/context/textbuffer and sets the BAD_ROUTE flag.
*/
static void* Tokenizer_fail_route(Tokenizer* self)
{
@@ -2681,7 +2681,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
*/
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{
PyObject *text, *temp;
PyObject *text, *temp, *tokens;
int context = 0, skip_style_tags = 0;

if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
@@ -2704,13 +2704,29 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
Py_XDECREF(temp);
self->text = text;
}

self->head = self->global = self->depth = self->cycles = 0;
self->length = PyList_GET_SIZE(self->text);
self->skip_style_tags = skip_style_tags;
return Tokenizer_parse(self, context, 1);
tokens = Tokenizer_parse(self, context, 1);

if (!tokens && !PyErr_Occurred()) {
if (!ParserError) {
if (load_exceptions())
return NULL;
}
if (BAD_ROUTE) {
RESET_ROUTE();
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
}
else
PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
return NULL;
}
return tokens;
}

static int load_entitydefs(void)
static int load_entities(void)
{
PyObject *tempmod, *defmap, *deflist;
unsigned numdefs, i;
@@ -2814,7 +2830,7 @@ static int load_tokens(void)
return 0;
}

static int load_definitions(void)
static int load_defs(void)
{
PyObject *tempmod,
*globals = PyEval_GetGlobals(),
@@ -2835,6 +2851,29 @@ static int load_definitions(void)
return 0;
}

/*
    Import the mwparserfromhell.parser module and store its ParserError
    class in the file-level ParserError pointer.

    Returns 0 on success and -1 on failure. On failure ParserError is left
    NULL; the C-API call that failed is expected to have set the Python
    exception, so callers can simply propagate the error.
*/
static int load_exceptions(void)
{
    PyObject *tempmod, *parsermod,
             *globals = PyEval_GetGlobals(),
             *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1),
             *modname = IMPORT_NAME_FUNC("parser");
    char *name = "mwparserfromhell";

    if (!fromlist || !modname) {
        /* Only one of the two allocations may have failed; release the
           other so it is not leaked. */
        Py_XDECREF(fromlist);
        Py_XDECREF(modname);
        return -1;
    }
    /* PyList_SET_ITEM steals the reference to modname, so from here on
       releasing fromlist releases modname as well. */
    PyList_SET_ITEM(fromlist, 0, modname);
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
        return -1;
    parsermod = PyObject_GetAttrString(tempmod, "parser");
    Py_DECREF(tempmod);
    if (!parsermod)
        return -1;
    ParserError = PyObject_GetAttrString(parsermod, "ParserError");
    Py_DECREF(parsermod);
    /* Report a failed lookup instead of claiming success with a NULL
       exception class, which the caller would pass to PyErr_SetString. */
    if (!ParserError)
        return -1;
    return 0;
}

PyMODINIT_FUNC INIT_FUNC_NAME(void)
{
PyObject *module;
@@ -2851,9 +2890,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void)
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
EMPTY = PyUnicode_FromString("");
NOARGS = PyTuple_New(0);
if (!EMPTY || !NOARGS)
INIT_ERROR;
if (load_entitydefs() || load_tokens() || load_definitions())
if (!EMPTY || !NOARGS || load_entities() || load_tokens() || load_defs())
INIT_ERROR;
#ifdef IS_PY3K
return module;


+ 3
- 0
mwparserfromhell/parser/tokenizer.h Bestand weergeven

@@ -62,6 +62,7 @@ static char** entitydefs;

static PyObject* EMPTY;
static PyObject* NOARGS;
static PyObject* ParserError;
static PyObject* definitions;


@@ -268,6 +269,8 @@ static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, int, int);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

static int load_exceptions(void);


/* Macros for Python 2/3 compatibility: */



+ 5
- 2
mwparserfromhell/parser/tokenizer.py Bestand weergeven

@@ -24,7 +24,7 @@ from __future__ import unicode_literals
from math import log
import re

from . import contexts, tokens
from . import contexts, tokens, ParserError
from ..compat import htmlentities, range
from ..definitions import (get_html_tag, is_parsable, is_single,
is_single_only, is_scheme)
@@ -1154,4 +1154,7 @@ class Tokenizer(object):
split = self.regex.split(text)
self._text = [segment for segment in split if segment]
self._head = self._global = self._depth = self._cycles = 0
return self._parse(context)
try:
return self._parse(context)
except BadRoute: # pragma: no cover (untestable/exceptional case)
raise ParserError("Python tokenizer exited with BadRoute")

+ 1
- 1
mwparserfromhell/parser/tokens.py Bestand weergeven

@@ -34,7 +34,7 @@ from ..compat import py3k, str

__all__ = ["Token"]

class Token (dict):
class Token(dict):
"""A token stores the semantic meaning of a unit of wikicode."""

def __repr__(self):


+ 1
- 1
mwparserfromhell/utils.py Bestand weergeven

@@ -66,7 +66,7 @@ def parse_anything(value, context=0, skip_style_tags=False):
nodelist = SmartList()
for item in value:
nodelist += parse_anything(item, context, skip_style_tags).nodes
return Wikicode(nodelist)
except TypeError:
error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}"
raise ValueError(error.format(type(value).__name__, value))
return Wikicode(nodelist)

+ 7
- 1
tests/test_builder.py Bestand weergeven

@@ -30,7 +30,7 @@ except ImportError:
from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading,
HTMLEntity, Tag, Template, Text, Wikilink)
from mwparserfromhell.nodes.extras import Attribute, Parameter
from mwparserfromhell.parser import tokens
from mwparserfromhell.parser import tokens, ParserError
from mwparserfromhell.parser.builder import Builder

from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -420,5 +420,11 @@ class TestBuilder(TreeEqualityTestCase):
named=True)]))])])
self.assertWikicodeEqual(valid, self.builder.build(test))

def test_parser_error(self):
    """test that building an unexpected token raises ParserError"""
    pattern = r"_handle_token\(\) got unexpected TemplateClose"
    stray_tokens = [tokens.TemplateClose()]
    with self.assertRaisesRegexp(ParserError, pattern):
        self.builder.build(stray_tokens)

if __name__ == "__main__":
unittest.main(verbosity=2)

Laden…
Annuleren
Opslaan