@@ -0,0 +1,4 @@ | |||||
[report] | |||||
exclude_lines = | |||||
pragma: no cover | |||||
raise NotImplementedError() |
@@ -4,9 +4,11 @@ | |||||
*.dll | *.dll | ||||
*.egg | *.egg | ||||
*.egg-info | *.egg-info | ||||
.coverage | |||||
.DS_Store | .DS_Store | ||||
__pycache__ | __pycache__ | ||||
build | build | ||||
dist | dist | ||||
docs/_build | docs/_build | ||||
scripts/*.log | scripts/*.log | ||||
htmlcov/ |
@@ -10,6 +10,9 @@ v0.4 (unreleased): | |||||
option, RECURSE_OTHERS, which recurses over all children except instances of | option, RECURSE_OTHERS, which recurses over all children except instances of | ||||
'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)` | 'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)` | ||||
returns all un-nested templates). | returns all un-nested templates). | ||||
- If something goes wrong while parsing, ParserError will now be raised. | |||||
Previously, the parser would produce an unclear BadRoute exception or allow | |||||
an incorrect node tree to be built. | |||||
- Fixed a parser bug involving nested tags. | - Fixed a parser bug involving nested tags. | ||||
- Updated and fixed some documentation. | - Updated and fixed some documentation. | ||||
@@ -18,6 +18,9 @@ Unreleased | |||||
which recurses over all children except instances of *forcetype* (for | which recurses over all children except instances of *forcetype* (for | ||||
example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all un-nested | example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all un-nested | ||||
templates). | templates). | ||||
- If something goes wrong while parsing, :py:exc:`.ParserError` will now be | |||||
raised. Previously, the parser would produce an unclear :py:exc:`.BadRoute` | |||||
exception or allow an incorrect node tree to be built. | |||||
- Fixed a parser bug involving nested tags. | - Fixed a parser bug involving nested tags. | ||||
- Updated and fixed some documentation. | - Updated and fixed some documentation. | ||||
@@ -55,8 +55,8 @@ class Node(StringMixIn): | |||||
raise NotImplementedError() | raise NotImplementedError() | ||||
def __children__(self): | def __children__(self): | ||||
return # Funny generator-that-yields-nothing syntax | |||||
yield | |||||
return | |||||
yield # pragma: no cover (this is a generator that yields nothing) | |||||
def __strip__(self, normalize, collapse): | def __strip__(self, normalize, collapse): | ||||
return None | return None | ||||
@@ -26,6 +26,19 @@ modules: the :py:mod:`~.tokenizer` and the :py:mod:`~.builder`. This module | |||||
joins them together under one interface. | joins them together under one interface. | ||||
""" | """ | ||||
# NOTE(review): this appears to be declared ahead of the submodule imports that
# follow because the builder and tokenizer modules import ParserError from this
# package while it is still being loaded -- confirm before moving it.
class ParserError(Exception):
    """Raised when the parser detects an internal inconsistency.

    Seeing this exception does not imply that the wikicode being parsed was
    malformed; invalid markup is still expected to parse correctly. Instead,
    the parser found itself in an impossible internal state and is bailing
    out before other problems can happen. Its appearance indicates a bug.

    *extra* is a short description of what went wrong; it is embedded in the
    exception message.
    """

    def __init__(self, extra):
        template = "This is a bug and should be reported. Info: {0}."
        super(ParserError, self).__init__(template.format(extra))
from .builder import Builder | from .builder import Builder | ||||
from .tokenizer import Tokenizer | from .tokenizer import Tokenizer | ||||
try: | try: | ||||
@@ -35,15 +48,22 @@ except ImportError: | |||||
CTokenizer = None | CTokenizer = None | ||||
use_c = False | use_c = False | ||||
__all__ = ["use_c", "Parser"] | |||||
__all__ = ["use_c", "Parser", "ParserError"] | |||||
class Parser(object): | class Parser(object): | ||||
"""Represents a parser for wikicode. | """Represents a parser for wikicode. | ||||
Actual parsing is a two-step process: first, the text is split up into a | Actual parsing is a two-step process: first, the text is split up into a | ||||
series of tokens by the :py:class:`~.Tokenizer`, and then the tokens are | |||||
converted into trees of :py:class:`~.Wikicode` objects and | |||||
:py:class:`~.Node`\ s by the :py:class:`~.Builder`. | |||||
series of tokens by the :py:class:`.Tokenizer`, and then the tokens are | |||||
converted into trees of :py:class:`.Wikicode` objects and | |||||
:py:class:`.Node`\ s by the :py:class:`.Builder`. | |||||
Instances of this class or its dependents (:py:class:`.Tokenizer` and | |||||
:py:class:`.Builder`) should not be shared between threads. | |||||
:py:meth:`parse` can be called multiple times as long as it is not done | |||||
concurrently. In general, there is no need to do this because parsing | |||||
should be done through :py:func:`mwparserfromhell.parse`, which creates a | |||||
new :py:class:`.Parser` object as necessary. | |||||
""" | """ | ||||
def __init__(self): | def __init__(self): | ||||
@@ -65,6 +85,9 @@ class Parser(object): | |||||
If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be | If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be | ||||
parsed, but instead will be treated as plain text. | parsed, but instead will be treated as plain text. | ||||
If there is an internal error while parsing, :py:exc:`.ParserError` | |||||
will be raised. | |||||
""" | """ | ||||
tokens = self._tokenizer.tokenize(text, context, skip_style_tags) | tokens = self._tokenizer.tokenize(text, context, skip_style_tags) | ||||
code = self._builder.build(tokens) | code = self._builder.build(tokens) | ||||
@@ -22,7 +22,7 @@ | |||||
from __future__ import unicode_literals | from __future__ import unicode_literals | ||||
from . import tokens | |||||
from . import tokens, ParserError | |||||
from ..compat import str | from ..compat import str | ||||
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, | from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, | ||||
Template, Text, Wikilink) | Template, Text, Wikilink) | ||||
@@ -33,33 +33,28 @@ from ..wikicode import Wikicode | |||||
__all__ = ["Builder"] | __all__ = ["Builder"] | ||||
class Builder(object): | class Builder(object): | ||||
"""Combines a sequence of tokens into a tree of ``Wikicode`` objects. | |||||
"""Builds a tree of nodes out of a sequence of tokens. | |||||
To use, pass a list of :py:class:`~.Token`\ s to the :py:meth:`build` | To use, pass a list of :py:class:`~.Token`\ s to the :py:meth:`build` | ||||
method. The list will be exhausted as it is parsed and a | method. The list will be exhausted as it is parsed and a | ||||
:py:class:`~.Wikicode` object will be returned. | |||||
:py:class:`.Wikicode` object containing the node tree will be returned. | |||||
""" | """ | ||||
def __init__(self): | def __init__(self): | ||||
self._tokens = [] | self._tokens = [] | ||||
self._stacks = [] | self._stacks = [] | ||||
def _wrap(self, nodes): | |||||
"""Properly wrap a list of nodes in a ``Wikicode`` object.""" | |||||
return Wikicode(SmartList(nodes)) | |||||
def _push(self):
    """Push a new node list onto the stack.

    The new list starts out empty and is appended to the end of
    ``self._stacks``.
    """
    self._stacks.append([])
def _pop(self, wrap=True): | |||||
def _pop(self): | |||||
"""Pop the current node list off of the stack. | """Pop the current node list off of the stack. | ||||
If *wrap* is ``True``, we will call :py:meth:`_wrap` on the list. | |||||
The raw node list is wrapped in a :py:class:`.SmartList` and then in a | |||||
:py:class:`.Wikicode` object. | |||||
""" | """ | ||||
if wrap: | |||||
return self._wrap(self._stacks.pop()) | |||||
return self._stacks.pop() | |||||
return Wikicode(SmartList(self._stacks.pop())) | |||||
def _write(self, item): | def _write(self, item): | ||||
"""Append a node to the current node list.""" | """Append a node to the current node list.""" | ||||
@@ -84,7 +79,7 @@ class Builder(object): | |||||
self._tokens.append(token) | self._tokens.append(token) | ||||
value = self._pop() | value = self._pop() | ||||
if key is None: | if key is None: | ||||
key = self._wrap([Text(str(default))]) | |||||
key = Wikicode(SmartList([Text(str(default))])) | |||||
return Parameter(key, value, showkey) | return Parameter(key, value, showkey) | ||||
else: | else: | ||||
self._write(self._handle_token(token)) | self._write(self._handle_token(token)) | ||||
@@ -270,6 +265,8 @@ class Builder(object): | |||||
return self._handle_comment() | return self._handle_comment() | ||||
elif isinstance(token, tokens.TagOpenOpen): | elif isinstance(token, tokens.TagOpenOpen): | ||||
return self._handle_tag(token) | return self._handle_tag(token) | ||||
err = "_handle_token() got unexpected {0}".format(type(token).__name__) | |||||
raise ParserError(err) | |||||
def build(self, tokenlist): | def build(self, tokenlist): | ||||
"""Build a Wikicode object from a list tokens and return it.""" | """Build a Wikicode object from a list tokens and return it.""" | ||||
@@ -347,7 +347,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) | |||||
/* | /* | ||||
Fail the current tokenization route. Discards the current | Fail the current tokenization route. Discards the current | ||||
stack/context/textbuffer and raises a BadRoute exception. | |||||
stack/context/textbuffer and sets the BAD_ROUTE flag. | |||||
*/ | */ | ||||
static void* Tokenizer_fail_route(Tokenizer* self) | static void* Tokenizer_fail_route(Tokenizer* self) | ||||
{ | { | ||||
@@ -2681,7 +2681,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||||
*/ | */ | ||||
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | ||||
{ | { | ||||
PyObject *text, *temp; | |||||
PyObject *text, *temp, *tokens; | |||||
int context = 0, skip_style_tags = 0; | int context = 0, skip_style_tags = 0; | ||||
if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { | if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { | ||||
@@ -2704,13 +2704,29 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||||
Py_XDECREF(temp); | Py_XDECREF(temp); | ||||
self->text = text; | self->text = text; | ||||
} | } | ||||
self->head = self->global = self->depth = self->cycles = 0; | self->head = self->global = self->depth = self->cycles = 0; | ||||
self->length = PyList_GET_SIZE(self->text); | self->length = PyList_GET_SIZE(self->text); | ||||
self->skip_style_tags = skip_style_tags; | self->skip_style_tags = skip_style_tags; | ||||
return Tokenizer_parse(self, context, 1); | |||||
tokens = Tokenizer_parse(self, context, 1); | |||||
if (!tokens && !PyErr_Occurred()) { | |||||
if (!ParserError) { | |||||
if (load_exceptions()) | |||||
return NULL; | |||||
} | |||||
if (BAD_ROUTE) { | |||||
RESET_ROUTE(); | |||||
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); | |||||
} | |||||
else | |||||
PyErr_SetString(ParserError, "C tokenizer exited unexpectedly"); | |||||
return NULL; | |||||
} | |||||
return tokens; | |||||
} | } | ||||
static int load_entitydefs(void) | |||||
static int load_entities(void) | |||||
{ | { | ||||
PyObject *tempmod, *defmap, *deflist; | PyObject *tempmod, *defmap, *deflist; | ||||
unsigned numdefs, i; | unsigned numdefs, i; | ||||
@@ -2814,7 +2830,7 @@ static int load_tokens(void) | |||||
return 0; | return 0; | ||||
} | } | ||||
static int load_definitions(void) | |||||
static int load_defs(void) | |||||
{ | { | ||||
PyObject *tempmod, | PyObject *tempmod, | ||||
*globals = PyEval_GetGlobals(), | *globals = PyEval_GetGlobals(), | ||||
@@ -2835,6 +2851,29 @@ static int load_definitions(void) | |||||
return 0; | return 0; | ||||
} | } | ||||
static int load_exceptions(void) | |||||
{ | |||||
PyObject *tempmod, *parsermod, | |||||
*globals = PyEval_GetGlobals(), | |||||
*locals = PyEval_GetLocals(), | |||||
*fromlist = PyList_New(1), | |||||
*modname = IMPORT_NAME_FUNC("parser"); | |||||
char *name = "mwparserfromhell"; | |||||
if (!fromlist || !modname) | |||||
return -1; | |||||
PyList_SET_ITEM(fromlist, 0, modname); | |||||
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0); | |||||
Py_DECREF(fromlist); | |||||
if (!tempmod) | |||||
return -1; | |||||
parsermod = PyObject_GetAttrString(tempmod, "parser"); | |||||
Py_DECREF(tempmod); | |||||
ParserError = PyObject_GetAttrString(parsermod, "ParserError"); | |||||
Py_DECREF(parsermod); | |||||
return 0; | |||||
} | |||||
PyMODINIT_FUNC INIT_FUNC_NAME(void) | PyMODINIT_FUNC INIT_FUNC_NAME(void) | ||||
{ | { | ||||
PyObject *module; | PyObject *module; | ||||
@@ -2851,9 +2890,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void) | |||||
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True); | PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True); | ||||
EMPTY = PyUnicode_FromString(""); | EMPTY = PyUnicode_FromString(""); | ||||
NOARGS = PyTuple_New(0); | NOARGS = PyTuple_New(0); | ||||
if (!EMPTY || !NOARGS) | |||||
INIT_ERROR; | |||||
if (load_entitydefs() || load_tokens() || load_definitions()) | |||||
if (!EMPTY || !NOARGS || load_entities() || load_tokens() || load_defs()) | |||||
INIT_ERROR; | INIT_ERROR; | ||||
#ifdef IS_PY3K | #ifdef IS_PY3K | ||||
return module; | return module; | ||||
@@ -62,6 +62,7 @@ static char** entitydefs; | |||||
static PyObject* EMPTY; | static PyObject* EMPTY; | ||||
static PyObject* NOARGS; | static PyObject* NOARGS; | ||||
static PyObject* ParserError; | |||||
static PyObject* definitions; | static PyObject* definitions; | ||||
@@ -268,6 +269,8 @@ static int Tokenizer_parse_tag(Tokenizer*); | |||||
static PyObject* Tokenizer_parse(Tokenizer*, int, int); | static PyObject* Tokenizer_parse(Tokenizer*, int, int); | ||||
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | ||||
static int load_exceptions(void); | |||||
/* Macros for Python 2/3 compatibility: */ | /* Macros for Python 2/3 compatibility: */ | ||||
@@ -24,7 +24,7 @@ from __future__ import unicode_literals | |||||
from math import log | from math import log | ||||
import re | import re | ||||
from . import contexts, tokens | |||||
from . import contexts, tokens, ParserError | |||||
from ..compat import htmlentities, range | from ..compat import htmlentities, range | ||||
from ..definitions import (get_html_tag, is_parsable, is_single, | from ..definitions import (get_html_tag, is_parsable, is_single, | ||||
is_single_only, is_scheme) | is_single_only, is_scheme) | ||||
@@ -1154,4 +1154,7 @@ class Tokenizer(object): | |||||
split = self.regex.split(text) | split = self.regex.split(text) | ||||
self._text = [segment for segment in split if segment] | self._text = [segment for segment in split if segment] | ||||
self._head = self._global = self._depth = self._cycles = 0 | self._head = self._global = self._depth = self._cycles = 0 | ||||
return self._parse(context) | |||||
try: | |||||
return self._parse(context) | |||||
except BadRoute: # pragma: no cover (untestable/exceptional case) | |||||
raise ParserError("Python tokenizer exited with BadRoute") |
@@ -34,7 +34,7 @@ from ..compat import py3k, str | |||||
__all__ = ["Token"] | __all__ = ["Token"] | ||||
class Token (dict): | |||||
class Token(dict): | |||||
"""A token stores the semantic meaning of a unit of wikicode.""" | """A token stores the semantic meaning of a unit of wikicode.""" | ||||
def __repr__(self): | def __repr__(self): | ||||
@@ -66,7 +66,7 @@ def parse_anything(value, context=0, skip_style_tags=False): | |||||
nodelist = SmartList() | nodelist = SmartList() | ||||
for item in value: | for item in value: | ||||
nodelist += parse_anything(item, context, skip_style_tags).nodes | nodelist += parse_anything(item, context, skip_style_tags).nodes | ||||
return Wikicode(nodelist) | |||||
except TypeError: | except TypeError: | ||||
error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" | error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" | ||||
raise ValueError(error.format(type(value).__name__, value)) | raise ValueError(error.format(type(value).__name__, value)) | ||||
return Wikicode(nodelist) |
@@ -30,7 +30,7 @@ except ImportError: | |||||
from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, | from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, | ||||
HTMLEntity, Tag, Template, Text, Wikilink) | HTMLEntity, Tag, Template, Text, Wikilink) | ||||
from mwparserfromhell.nodes.extras import Attribute, Parameter | from mwparserfromhell.nodes.extras import Attribute, Parameter | ||||
from mwparserfromhell.parser import tokens | |||||
from mwparserfromhell.parser import tokens, ParserError | |||||
from mwparserfromhell.parser.builder import Builder | from mwparserfromhell.parser.builder import Builder | ||||
from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext | from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext | ||||
@@ -420,5 +420,11 @@ class TestBuilder(TreeEqualityTestCase): | |||||
named=True)]))])]) | named=True)]))])]) | ||||
self.assertWikicodeEqual(valid, self.builder.build(test)) | self.assertWikicodeEqual(valid, self.builder.build(test)) | ||||
def test_parser_error(self):
    """test whether ParserError gets thrown for bad input"""
    # A lone closing token has no matching opener, so the builder's token
    # dispatcher should reject it instead of building a tree.
    bad_tokens = [tokens.TemplateClose()]
    pattern = r"_handle_token\(\) got unexpected TemplateClose"
    with self.assertRaisesRegexp(ParserError, pattern):
        self.builder.build(bad_tokens)
if __name__ == "__main__": | if __name__ == "__main__": | ||||
unittest.main(verbosity=2) | unittest.main(verbosity=2) |