
Finish documenting all of the main things (#5).

tags/v0.1
Ben Kurtovic 11 years ago
commit e787c97712
3 changed files with 57 additions and 2 deletions:

  1. mwparserfromhell/parser/builder.py (+2, -2)
  2. mwparserfromhell/parser/tokenizer.py (+42, -0)
  3. mwparserfromhell/parser/tokens.py (+13, -0)

mwparserfromhell/parser/builder.py (+2, -2)

@@ -53,7 +53,7 @@ class Builder(object):
         self._stacks.append([])
 
     def _pop(self, wrap=True):
-        """Pop the topmost node list off of the stack.
+        """Pop the current node list off of the stack.
 
         If *wrap* is ``True``, we will call :py:meth:`_wrap` on the list.
         """
@@ -62,7 +62,7 @@ class Builder(object):
         return self._stacks.pop()
 
     def _write(self, item):
-        """Append a node to the topmost node list."""
+        """Append a node to the current node list."""
         self._stacks[-1].append(item)
 
     def _handle_parameter(self, default):
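
Taken together, these docstrings describe the Builder's stack-of-lists discipline: _push opens a fresh node list, _write appends to whichever list is current, and _pop closes the list, optionally wrapping it. A minimal standalone sketch of that pattern; the real _wrap converts the node list into a Wikicode object, which is simplified to a plain return here:

    class NodeListStack(object):
        """Toy model of the Builder's stack discipline, not the real class."""

        def __init__(self):
            self._stacks = []

        def _wrap(self, nodes):
            # Stand-in: the real Builder wraps the nodes in a Wikicode object.
            return nodes

        def _push(self):
            self._stacks.append([])

        def _pop(self, wrap=True):
            # Pop the current node list; optionally wrap it first.
            nodes = self._stacks.pop()
            return self._wrap(nodes) if wrap else nodes

        def _write(self, item):
            self._stacks[-1].append(item)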


mwparserfromhell/parser/tokenizer.py (+42, -0)

@@ -32,10 +32,12 @@ from ..compat import htmlentities
 __all__ = ["Tokenizer"]
 
 class BadRoute(Exception):
+    """Raised internally when the current tokenization route is invalid."""
     pass
 
 
 class Tokenizer(object):
+    """Creates a list of tokens from a string of wikicode."""
     START = object()
     END = object()
     MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
@@ -50,10 +52,12 @@ class Tokenizer(object):
 
     @property
     def _stack(self):
+        """The current token stack."""
         return self._stacks[-1][0]
 
     @property
     def _context(self):
+        """The current token context."""
         return self._stacks[-1][1]
 
     @_context.setter
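
As the _push method further down shows, each entry of self._stacks is a three-item list, so these properties are just index lookups into the topmost entry. Roughly:

    # Illustrative layout of one stack entry (see _push below):
    entry = [[], 0, []]    # [token stack, context bitfield, textbuffer]
    stack, context, textbuffer = entry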
@@ -62,6 +66,7 @@ class Tokenizer(object):
 
     @property
     def _textbuffer(self):
+        """Return the current textbuffer."""
         return self._stacks[-1][2]
 
     @_textbuffer.setter
@@ -69,35 +74,57 @@ class Tokenizer(object):
         self._stacks[-1][2] = value
 
     def _push(self, context=0):
+        """Add a new token stack, context, and textbuffer to the list."""
         self._stacks.append([[], context, []])
 
     def _push_textbuffer(self):
+        """Push the textbuffer onto the stack as a Text node and clear it."""
         if self._textbuffer:
             self._stack.append(tokens.Text(text="".join(self._textbuffer)))
             self._textbuffer = []
 
     def _pop(self):
+        """Pop the current stack/context/textbuffer, returning the stack."""
         self._push_textbuffer()
         return self._stacks.pop()[0]
 
     def _fail_route(self):
+        """Fail the current tokenization route.
+
+        Discards the current stack/context/textbuffer and raises
+        :py:exc:`~mwparserfromhell.parser.tokenizer.BadRoute`.
+        """
         self._pop()
         raise BadRoute()
 
     def _write(self, token):
+        """Write a token to the current token stack."""
         self._push_textbuffer()
         self._stack.append(token)
 
     def _write_text(self, text):
+        """Write text to the current textbuffer."""
         self._textbuffer.append(text)
 
     def _write_all(self, tokenlist):
+        """Write a series of tokens to the current stack at once."""
         if tokenlist and isinstance(tokenlist[0], tokens.Text):
             self._write_text(tokenlist.pop(0).text)
         self._push_textbuffer()
         self._stack.extend(tokenlist)
 
     def _read(self, delta=0, wrap=False, strict=False):
+        """Read the value at a relative point in the wikicode.
+
+        The value is read from :py:attr:`self._head <_head>` plus the value of
+        *delta* (which can be negative). If *wrap* is ``False``, we will not
+        allow attempts to read from the end of the string if ``self._head +
+        delta`` is negative. If *strict* is ``True``, the route will be failed
+        (with :py:meth:`_fail_route`) if we try to read from past the end of
+        the string; otherwise, :py:attr:`self.END <END>` is returned. If we try
+        to read from before the start of the string, :py:attr:`self.START
+        <START>` is returned.
+        """
         index = self._head + delta
         if index < 0 and (not wrap or abs(index) > len(self._text)):
             return self.START
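
The textbuffer exists so that runs of characters written one at a time through _write_text collapse into a single Text token when _push_textbuffer flushes them, instead of producing one token per character. A self-contained sketch of that coalescing, using a stand-in for tokens.Text:

    class Text(object):
        """Stand-in for tokens.Text, which stores its attributes as kwargs."""
        def __init__(self, text):
            self.text = text

    stack, textbuffer = [], []

    def write_text(char):
        textbuffer.append(char)          # buffer characters as they arrive

    def push_textbuffer():
        if textbuffer:                   # flush the whole run as one token
            stack.append(Text("".join(textbuffer)))
            del textbuffer[:]

    for char in "foo":
        write_text(char)
    push_textbuffer()
    assert len(stack) == 1 and stack[0].text == "foo"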
@@ -109,6 +136,7 @@ class Tokenizer(object):
             return self.END
 
     def _parse_template(self):
+        """Parse a template at the head of the wikicode string."""
         reset = self._head
         self._head += 2
         try:
@@ -122,6 +150,11 @@ class Tokenizer(object):
         self._write(tokens.TemplateClose())
 
     def _verify_template_name(self):
+        """Verify that a template's name is valid wikisyntax.
+
+        The route will be failed if the name contains a newline inside of it
+        (not merely at the beginning or end).
+        """
         self._push_textbuffer()
         if self._stack:
             text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
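
Given the docstring, the elided body presumably strips the joined text before looking for a newline, so leading and trailing newlines are allowed but internal ones fail the route. A plausible form of the check (the actual condition is not shown in this hunk):

    name = "".join(tok.text for tok in text)
    if "\n" in name.strip():    # newline inside the name, not at the edges
        self._fail_route()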
@@ -130,6 +163,7 @@ class Tokenizer(object):
             self._fail_route()
 
     def _handle_template_param(self):
+        """Handle a template parameter at the head of the string."""
         if self._context & contexts.TEMPLATE_NAME:
             self._verify_template_name()
         self._context ^= contexts.TEMPLATE_NAME
@@ -139,17 +173,20 @@ class Tokenizer(object):
         self._write(tokens.TemplateParamSeparator())
 
     def _handle_template_param_value(self):
+        """Handle a template parameter's value at the head of the string."""
         self._context ^= contexts.TEMPLATE_PARAM_KEY
         self._context |= contexts.TEMPLATE_PARAM_VALUE
         self._write(tokens.TemplateParamEquals())
 
     def _handle_template_end(self):
+        """Handle the end of the template at the head of the string."""
         if self._context & contexts.TEMPLATE_NAME:
             self._verify_template_name()
         self._head += 1
         return self._pop()
 
     def _parse_heading(self):
+        """Parse a section heading at the head of the wikicode string."""
         self._global |= contexts.GL_HEADING
         reset = self._head
         self._head += 1
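
The contexts values being manipulated here behave as bitflags: |= enters a context, ^= leaves one that is known to be set, and & tests for membership. Assuming the flags are distinct powers of two (the contexts module is not part of this diff), the transition in _handle_template_param_value works like this:

    # Assumed flag values; the real contexts module is not shown here.
    TEMPLATE_NAME = 1 << 0
    TEMPLATE_PARAM_KEY = 1 << 1
    TEMPLATE_PARAM_VALUE = 1 << 2

    context = TEMPLATE_PARAM_KEY     # currently parsing a parameter key
    context ^= TEMPLATE_PARAM_KEY    # leave the key context...
    context |= TEMPLATE_PARAM_VALUE  # ...and enter the value context
    assert not context & TEMPLATE_PARAM_KEY
    assert context & TEMPLATE_PARAM_VALUE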
@@ -174,6 +211,7 @@ class Tokenizer(object):
         self._global ^= contexts.GL_HEADING
 
     def _handle_heading_end(self):
+        """Handle the end of a section heading at the head of the string."""
         reset = self._head
         self._head += 1
         best = 1
@@ -196,6 +234,7 @@ class Tokenizer(object):
         return self._pop(), after_level
 
     def _really_parse_entity(self):
+        """Actually parse an HTML entity and ensure that it is valid."""
         self._write(tokens.HTMLEntityStart())
         self._head += 1

@@ -237,6 +276,7 @@ class Tokenizer(object):
         self._write(tokens.HTMLEntityEnd())
 
     def _parse_entity(self):
+        """Parse an HTML entity at the head of the wikicode string."""
         reset = self._head
         self._push()
         try:
@@ -248,6 +288,7 @@ class Tokenizer(object):
         self._write_all(self._pop())
 
     def _parse(self, context=0):
+        """Parse the wikicode string, using *context* for when to stop."""
         self._push(context)
         while True:
             this = self._read()
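
_parse_template and _parse_entity share a speculative-parsing pattern built on BadRoute: remember the head position, push a fresh stack, and attempt the parse; on failure, rewind and fall back to literal text. Schematically (the except branches are elided in the hunks above, so the recovery details here are an assumption):

    def _parse_speculatively(self):
        reset = self._head           # remember where the attempt began
        self._push()                 # fresh stack/context/textbuffer
        try:
            self._do_nested_parse()  # hypothetical; may call _fail_route()
        except BadRoute:
            # _fail_route() already discarded the speculative stack, so
            # only the head needs rewinding before emitting literal text.
            self._head = reset
        else:
            self._write_all(self._pop())  # success: merge the tokens back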
@@ -281,6 +322,7 @@ class Tokenizer(object):
             self._head += 1
 
     def tokenize(self, text):
+        """Build a list of tokens from a string of wikicode and return it."""
         split = self.regex.split(text)
         self._text = [segment for segment in split if segment]
         return self._parse()
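
tokenize() is the public entry point: it splits the input with self.regex, stores the nonempty segments, and hands off to _parse. A usage sketch; the exact token class names for the opening delimiters are inferred from TemplateClose above, so treat the output as illustrative:

    from mwparserfromhell.parser.tokenizer import Tokenizer

    tokenlist = Tokenizer().tokenize("{{foo}}")
    # Illustrative result: an open token, the name coalesced into a single
    # Text token by the textbuffer, and a close token, roughly:
    #   [TemplateOpen(), Text(text="foo"), TemplateClose()]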

mwparserfromhell/parser/tokens.py (+13, -0)

@@ -20,6 +20,16 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+"""
+This module contains the token definitions that are used as an intermediate
+parsing data type; they are stored in a flat list, with each token being
+identified by its type and optional attributes. The token list is generated in
+a syntactically valid form by the
+:py:class:`~mwparserfromhell.parser.tokenizer.Tokenizer`, and then converted
+into the :py:class:`~mwparserfromhell.wikicode.Wikicode` tree by the
+:py:class:`~mwparserfromhell.parser.builder.Builder`.
+"""
+
 from __future__ import unicode_literals
 
 from ..compat import basestring, py3k
@@ -27,6 +37,8 @@ from ..compat import basestring, py3k
 __all__ = ["Token"]
 
 class Token(object):
+    """A token represents the semantic meaning of a unit of wikicode."""
+
     def __init__(self, **kwargs):
         super(Token, self).__setattr__("_kwargs", kwargs)
 
@@ -54,6 +66,7 @@ class Token(object):
         del self._kwargs[key]
 
 
 def make(name):
+    """Create a new Token class using ``type()`` and add it to ``__all__``."""
     __all__.append(name)
     return type(name if py3k else name.encode("utf8"), (Token,), {})
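
Because make() builds each concrete class with type(), tokens like Text are plain Token subclasses whose attributes live in the _kwargs dict captured by __init__. A brief sketch of the factory in use (attribute reads presumably proxy to _kwargs; only the setter and deleter appear in this diff):

    Text = make("Text")      # define a new Token subclass and export it

    tok = Text(text="foo")   # keyword arguments land in tok._kwargs
    assert isinstance(tok, Token)
    assert tok._kwargs == {"text": "foo"}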



