@@ -0,0 +1,4 @@ | |||
[report] | |||
exclude_lines = | |||
pragma: no cover | |||
raise NotImplementedError() |
@@ -4,9 +4,11 @@ | |||
*.dll | |||
*.egg | |||
*.egg-info | |||
.coverage | |||
.DS_Store | |||
__pycache__ | |||
build | |||
dist | |||
docs/_build | |||
scripts/*.log | |||
htmlcov/ |
@@ -10,6 +10,9 @@ v0.4 (unreleased): | |||
option, RECURSE_OTHERS, which recurses over all children except instances of | |||
'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)` | |||
returns all un-nested templates). | |||
- If something goes wrong while parsing, ParserError will now be raised. | |||
Previously, the parser would produce an unclear BadRoute exception or allow | |||
an incorrect node tree to be built.
- Fixed a parser bug involving nested tags. | |||
- Updated and fixed some documentation. | |||
@@ -18,6 +18,9 @@ Unreleased | |||
which recurses over all children except instances of *forcetype* (for | |||
example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all un-nested | |||
templates). | |||
- If something goes wrong while parsing, :py:exc:`.ParserError` will now be | |||
raised. Previously, the parser would produce an unclear :py:exc:`.BadRoute` | |||
exception or allow an incorrect node tree to be built.
- Fixed a parser bug involving nested tags. | |||
- Updated and fixed some documentation. | |||
@@ -55,8 +55,8 @@ class Node(StringMixIn): | |||
raise NotImplementedError() | |||
def __children__(self): | |||
return # Funny generator-that-yields-nothing syntax | |||
yield | |||
return | |||
yield # pragma: no cover (this is a generator that yields nothing) | |||
def __strip__(self, normalize, collapse): | |||
return None | |||
@@ -26,6 +26,19 @@ modules: the :py:mod:`~.tokenizer` and the :py:mod:`~.builder`. This module | |||
joins them together under one interface. | |||
""" | |||
class ParserError(Exception):
    """Raised when the parser reaches an impossible internal state.

    This does not mean the wikicode was invalid -- malformed markup should
    still parse correctly. It means the parser caught itself in a broken
    internal state and bailed out before other problems could occur, so any
    appearance of this exception indicates a bug.
    """

    def __init__(self, extra):
        # Prefix the caller-supplied detail with a standard bug-report notice.
        super(ParserError, self).__init__(
            "This is a bug and should be reported. Info: {0}.".format(extra))
from .builder import Builder | |||
from .tokenizer import Tokenizer | |||
try: | |||
@@ -35,15 +48,22 @@ except ImportError: | |||
CTokenizer = None | |||
use_c = False | |||
__all__ = ["use_c", "Parser"] | |||
__all__ = ["use_c", "Parser", "ParserError"] | |||
class Parser(object): | |||
"""Represents a parser for wikicode. | |||
Actual parsing is a two-step process: first, the text is split up into a | |||
series of tokens by the :py:class:`~.Tokenizer`, and then the tokens are | |||
converted into trees of :py:class:`~.Wikicode` objects and | |||
:py:class:`~.Node`\ s by the :py:class:`~.Builder`. | |||
series of tokens by the :py:class:`.Tokenizer`, and then the tokens are | |||
converted into trees of :py:class:`.Wikicode` objects and | |||
:py:class:`.Node`\ s by the :py:class:`.Builder`. | |||
Instances of this class or its dependents (:py:class:`.Tokenizer` and | |||
:py:class:`.Builder`) should not be shared between threads. | |||
:py:meth:`parse` can be called multiple times as long as it is not done | |||
concurrently. In general, there is no need to do this because parsing | |||
should be done through :py:func:`mwparserfromhell.parse`, which creates a | |||
new :py:class:`.Parser` object as necessary. | |||
""" | |||
def __init__(self): | |||
@@ -65,6 +85,9 @@ class Parser(object): | |||
If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be | |||
parsed, but instead will be treated as plain text. | |||
If there is an internal error while parsing, :py:exc:`.ParserError` | |||
will be raised. | |||
""" | |||
tokens = self._tokenizer.tokenize(text, context, skip_style_tags) | |||
code = self._builder.build(tokens) | |||
@@ -22,7 +22,7 @@ | |||
from __future__ import unicode_literals | |||
from . import tokens | |||
from . import tokens, ParserError | |||
from ..compat import str | |||
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, | |||
Template, Text, Wikilink) | |||
@@ -33,33 +33,28 @@ from ..wikicode import Wikicode | |||
__all__ = ["Builder"] | |||
class Builder(object): | |||
"""Combines a sequence of tokens into a tree of ``Wikicode`` objects. | |||
"""Builds a tree of nodes out of a sequence of tokens. | |||
To use, pass a list of :py:class:`~.Token`\ s to the :py:meth:`build` | |||
method. The list will be exhausted as it is parsed and a | |||
:py:class:`~.Wikicode` object will be returned. | |||
:py:class:`.Wikicode` object containing the node tree will be returned. | |||
""" | |||
def __init__(self): | |||
self._tokens = [] | |||
self._stacks = [] | |||
def _wrap(self, nodes): | |||
"""Properly wrap a list of nodes in a ``Wikicode`` object.""" | |||
return Wikicode(SmartList(nodes)) | |||
def _push(self): | |||
"""Push a new node list onto the stack.""" | |||
self._stacks.append([]) | |||
def _pop(self, wrap=True): | |||
def _pop(self): | |||
"""Pop the current node list off of the stack. | |||
If *wrap* is ``True``, we will call :py:meth:`_wrap` on the list. | |||
The raw node list is wrapped in a :py:class:`.SmartList` and then in a | |||
:py:class:`.Wikicode` object. | |||
""" | |||
if wrap: | |||
return self._wrap(self._stacks.pop()) | |||
return self._stacks.pop() | |||
return Wikicode(SmartList(self._stacks.pop())) | |||
def _write(self, item): | |||
"""Append a node to the current node list.""" | |||
@@ -84,7 +79,7 @@ class Builder(object): | |||
self._tokens.append(token) | |||
value = self._pop() | |||
if key is None: | |||
key = self._wrap([Text(str(default))]) | |||
key = Wikicode(SmartList([Text(str(default))])) | |||
return Parameter(key, value, showkey) | |||
else: | |||
self._write(self._handle_token(token)) | |||
@@ -270,6 +265,8 @@ class Builder(object): | |||
return self._handle_comment() | |||
elif isinstance(token, tokens.TagOpenOpen): | |||
return self._handle_tag(token) | |||
err = "_handle_token() got unexpected {0}".format(type(token).__name__) | |||
raise ParserError(err) | |||
def build(self, tokenlist): | |||
"""Build a Wikicode object from a list of tokens and return it."""
@@ -347,7 +347,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) | |||
/* | |||
Fail the current tokenization route. Discards the current | |||
stack/context/textbuffer and raises a BadRoute exception. | |||
stack/context/textbuffer and sets the BAD_ROUTE flag. | |||
*/ | |||
static void* Tokenizer_fail_route(Tokenizer* self) | |||
{ | |||
@@ -2681,7 +2681,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||
*/ | |||
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||
{ | |||
PyObject *text, *temp; | |||
PyObject *text, *temp, *tokens; | |||
int context = 0, skip_style_tags = 0; | |||
if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { | |||
@@ -2704,13 +2704,29 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||
Py_XDECREF(temp); | |||
self->text = text; | |||
} | |||
self->head = self->global = self->depth = self->cycles = 0; | |||
self->length = PyList_GET_SIZE(self->text); | |||
self->skip_style_tags = skip_style_tags; | |||
return Tokenizer_parse(self, context, 1); | |||
tokens = Tokenizer_parse(self, context, 1); | |||
if (!tokens && !PyErr_Occurred()) { | |||
if (!ParserError) { | |||
if (load_exceptions()) | |||
return NULL; | |||
} | |||
if (BAD_ROUTE) { | |||
RESET_ROUTE(); | |||
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); | |||
} | |||
else | |||
PyErr_SetString(ParserError, "C tokenizer exited unexpectedly"); | |||
return NULL; | |||
} | |||
return tokens; | |||
} | |||
static int load_entitydefs(void) | |||
static int load_entities(void) | |||
{ | |||
PyObject *tempmod, *defmap, *deflist; | |||
unsigned numdefs, i; | |||
@@ -2814,7 +2830,7 @@ static int load_tokens(void) | |||
return 0; | |||
} | |||
static int load_definitions(void) | |||
static int load_defs(void) | |||
{ | |||
PyObject *tempmod, | |||
*globals = PyEval_GetGlobals(), | |||
@@ -2835,6 +2851,29 @@ static int load_definitions(void) | |||
return 0; | |||
} | |||
/*
    Import mwparserfromhell.parser and cache its ParserError exception in the
    module-level ParserError pointer. Returns 0 on success, -1 on failure
    (with a Python exception set).

    Fix: the original never checked the results of PyObject_GetAttrString(),
    so an attribute failure reported success while leaving ParserError NULL,
    which would later be passed to PyErr_SetString().
*/
static int load_exceptions(void)
{
    PyObject *tempmod, *parsermod,
             *globals = PyEval_GetGlobals(),
             *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1),
             *modname = IMPORT_NAME_FUNC("parser");
    char *name = "mwparserfromhell";

    if (!fromlist || !modname) {
        Py_XDECREF(fromlist);
        Py_XDECREF(modname);
        return -1;
    }
    /* PyList_SET_ITEM steals the reference to modname. */
    PyList_SET_ITEM(fromlist, 0, modname);
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
        return -1;
    parsermod = PyObject_GetAttrString(tempmod, "parser");
    Py_DECREF(tempmod);
    if (!parsermod)
        return -1;
    ParserError = PyObject_GetAttrString(parsermod, "ParserError");
    Py_DECREF(parsermod);
    if (!ParserError)
        return -1;
    return 0;
}
PyMODINIT_FUNC INIT_FUNC_NAME(void) | |||
{ | |||
PyObject *module; | |||
@@ -2851,9 +2890,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void) | |||
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True); | |||
EMPTY = PyUnicode_FromString(""); | |||
NOARGS = PyTuple_New(0); | |||
if (!EMPTY || !NOARGS) | |||
INIT_ERROR; | |||
if (load_entitydefs() || load_tokens() || load_definitions()) | |||
if (!EMPTY || !NOARGS || load_entities() || load_tokens() || load_defs()) | |||
INIT_ERROR; | |||
#ifdef IS_PY3K | |||
return module; | |||
@@ -62,6 +62,7 @@ static char** entitydefs; | |||
static PyObject* EMPTY; | |||
static PyObject* NOARGS; | |||
static PyObject* ParserError; | |||
static PyObject* definitions; | |||
@@ -268,6 +269,8 @@ static int Tokenizer_parse_tag(Tokenizer*); | |||
static PyObject* Tokenizer_parse(Tokenizer*, int, int); | |||
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | |||
static int load_exceptions(void); | |||
/* Macros for Python 2/3 compatibility: */ | |||
@@ -24,7 +24,7 @@ from __future__ import unicode_literals | |||
from math import log | |||
import re | |||
from . import contexts, tokens | |||
from . import contexts, tokens, ParserError | |||
from ..compat import htmlentities, range | |||
from ..definitions import (get_html_tag, is_parsable, is_single, | |||
is_single_only, is_scheme) | |||
@@ -1154,4 +1154,7 @@ class Tokenizer(object): | |||
split = self.regex.split(text) | |||
self._text = [segment for segment in split if segment] | |||
self._head = self._global = self._depth = self._cycles = 0 | |||
return self._parse(context) | |||
try: | |||
return self._parse(context) | |||
except BadRoute: # pragma: no cover (untestable/exceptional case) | |||
raise ParserError("Python tokenizer exited with BadRoute") |
@@ -34,7 +34,7 @@ from ..compat import py3k, str | |||
__all__ = ["Token"] | |||
class Token (dict): | |||
class Token(dict): | |||
"""A token stores the semantic meaning of a unit of wikicode.""" | |||
def __repr__(self): | |||
@@ -66,7 +66,7 @@ def parse_anything(value, context=0, skip_style_tags=False): | |||
nodelist = SmartList() | |||
for item in value: | |||
nodelist += parse_anything(item, context, skip_style_tags).nodes | |||
return Wikicode(nodelist) | |||
except TypeError: | |||
error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" | |||
raise ValueError(error.format(type(value).__name__, value)) | |||
return Wikicode(nodelist) |
@@ -30,7 +30,7 @@ except ImportError: | |||
from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, | |||
HTMLEntity, Tag, Template, Text, Wikilink) | |||
from mwparserfromhell.nodes.extras import Attribute, Parameter | |||
from mwparserfromhell.parser import tokens | |||
from mwparserfromhell.parser import tokens, ParserError | |||
from mwparserfromhell.parser.builder import Builder | |||
from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext | |||
@@ -420,5 +420,11 @@ class TestBuilder(TreeEqualityTestCase): | |||
named=True)]))])]) | |||
self.assertWikicodeEqual(valid, self.builder.build(test)) | |||
def test_parser_error(self):
    """test whether ParserError gets thrown for bad input"""
    expected = r"_handle_token\(\) got unexpected TemplateClose"
    with self.assertRaisesRegexp(ParserError, expected):
        self.builder.build([tokens.TemplateClose()])
if __name__ == "__main__": | |||
unittest.main(verbosity=2) |