From e787c97712af6d5760b621445cd065b844a43e20 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Tue, 21 Aug 2012 19:42:29 -0400
Subject: [PATCH] Finish documenting all of the main things (#5).

---
 mwparserfromhell/parser/builder.py   |  4 ++--
 mwparserfromhell/parser/tokenizer.py | 42 ++++++++++++++++++++++++++++++++++++
 mwparserfromhell/parser/tokens.py    | 13 +++++++++++
 3 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py
index 4636114..a31f638 100644
--- a/mwparserfromhell/parser/builder.py
+++ b/mwparserfromhell/parser/builder.py
@@ -53,7 +53,7 @@ class Builder(object):
         self._stacks.append([])
 
     def _pop(self, wrap=True):
-        """Pop the topmost node list off of the stack.
+        """Pop the current node list off of the stack.
 
         If *wrap* is ``True``, we will call :py:meth:`_wrap` on the list.
         """
@@ -62,7 +62,7 @@ class Builder(object):
         return self._stacks.pop()
 
     def _write(self, item):
-        """Append a node to the topmost node list."""
+        """Append a node to the current node list."""
         self._stacks[-1].append(item)
 
     def _handle_parameter(self, default):
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index df0166d..58df632 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -32,10 +32,12 @@ from ..compat import htmlentities
 __all__ = ["Tokenizer"]
 
 class BadRoute(Exception):
+    """Raised internally when the current tokenization route is invalid."""
     pass
 
 
 class Tokenizer(object):
+    """Creates a list of tokens from a string of wikicode."""
     START = object()
     END = object()
     MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
@@ -50,10 +52,12 @@ class Tokenizer(object):
 
     @property
     def _stack(self):
+        """The current token stack."""
         return self._stacks[-1][0]
 
     @property
     def _context(self):
+        """The current token context."""
         return self._stacks[-1][1]
 
     @_context.setter
@@ -62,6 +66,7 @@ class Tokenizer(object):
 
     @property
     def _textbuffer(self):
+        """Return the current textbuffer."""
         return self._stacks[-1][2]
 
     @_textbuffer.setter
@@ -69,35 +74,57 @@ class Tokenizer(object):
         self._stacks[-1][2] = value
 
     def _push(self, context=0):
+        """Add a new token stack, context, and textbuffer to the list."""
         self._stacks.append([[], context, []])
 
     def _push_textbuffer(self):
+        """Push the textbuffer onto the stack as a Text node and clear it."""
         if self._textbuffer:
             self._stack.append(tokens.Text(text="".join(self._textbuffer)))
             self._textbuffer = []
 
     def _pop(self):
+        """Pop the current stack/context/textbuffer, returning the stack."""
         self._push_textbuffer()
         return self._stacks.pop()[0]
 
     def _fail_route(self):
+        """Fail the current tokenization route.
+
+        Discards the current stack/context/textbuffer and raises
+        :py:exc:`~mwparserfromhell.parser.tokenizer.BadRoute`.
+        """
         self._pop()
         raise BadRoute()
 
     def _write(self, token):
+        """Write a token to the current token stack."""
         self._push_textbuffer()
         self._stack.append(token)
 
     def _write_text(self, text):
+        """Write text to the current textbuffer."""
         self._textbuffer.append(text)
 
     def _write_all(self, tokenlist):
+        """Write a series of tokens to the current stack at once."""
         if tokenlist and isinstance(tokenlist[0], tokens.Text):
             self._write_text(tokenlist.pop(0).text)
         self._push_textbuffer()
         self._stack.extend(tokenlist)
 
     def _read(self, delta=0, wrap=False, strict=False):
+        """Read the value at a relative point in the wikicode.
+
+        The value is read from :py:attr:`self._head <_head>` plus the value of
+        *delta* (which can be negative). If *wrap* is ``False``, we will not
+        allow attempts to read from the end of the string if ``self._head +
+        delta`` is negative. If *strict* is ``True``, the route will be failed
+        (with :py:meth:`_fail_route`) if we try to read from past the end of
+        the string; otherwise, :py:attr:`self.END <END>` is returned. If we try
+        to read from before the start of the string, :py:attr:`self.START
+        <START>` is returned.
+        """
         index = self._head + delta
         if index < 0 and (not wrap or abs(index) > len(self._text)):
             return self.START
@@ -109,6 +136,7 @@ class Tokenizer(object):
             return self.END
 
     def _parse_template(self):
+        """Parse a template at the head of the wikicode string."""
         reset = self._head
         self._head += 2
         try:
@@ -122,6 +150,11 @@ class Tokenizer(object):
         self._write(tokens.TemplateClose())
 
     def _verify_template_name(self):
+        """Verify that a template's name is valid wikisyntax.
+
+        The route will be failed if the name contains a newline inside of it
+        (not merely at the beginning or end).
+        """
         self._push_textbuffer()
         if self._stack:
             text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
@@ -130,6 +163,7 @@ class Tokenizer(object):
             self._fail_route()
 
     def _handle_template_param(self):
+        """Handle a template parameter at the head of the string."""
         if self._context & contexts.TEMPLATE_NAME:
             self._verify_template_name()
             self._context ^= contexts.TEMPLATE_NAME
@@ -139,17 +173,20 @@ class Tokenizer(object):
         self._write(tokens.TemplateParamSeparator())
 
     def _handle_template_param_value(self):
+        """Handle a template parameter's value at the head of the string."""
         self._context ^= contexts.TEMPLATE_PARAM_KEY
         self._context |= contexts.TEMPLATE_PARAM_VALUE
         self._write(tokens.TemplateParamEquals())
 
     def _handle_template_end(self):
+        """Handle the end of the template at the head of the string."""
         if self._context & contexts.TEMPLATE_NAME:
             self._verify_template_name()
         self._head += 1
         return self._pop()
 
     def _parse_heading(self):
+        """Parse a section heading at the head of the wikicode string."""
         self._global |= contexts.GL_HEADING
         reset = self._head
         self._head += 1
@@ -174,6 +211,7 @@ class Tokenizer(object):
         self._global ^= contexts.GL_HEADING
 
     def _handle_heading_end(self):
+        """Handle the end of a section heading at the head of the string."""
         reset = self._head
         self._head += 1
         best = 1
@@ -196,6 +234,7 @@ class Tokenizer(object):
         return self._pop(), after_level
 
     def _really_parse_entity(self):
+        """Actually parse an HTML entity and ensure that it is valid."""
         self._write(tokens.HTMLEntityStart())
         self._head += 1
 
@@ -237,6 +276,7 @@ class Tokenizer(object):
         self._write(tokens.HTMLEntityEnd())
 
     def _parse_entity(self):
+        """Parse an HTML entity at the head of the wikicode string."""
         reset = self._head
         self._push()
         try:
@@ -248,6 +288,7 @@ class Tokenizer(object):
         self._write_all(self._pop())
 
     def _parse(self, context=0):
+        """Parse the wikicode string, using *context* for when to stop."""
         self._push(context)
         while True:
             this = self._read()
@@ -281,6 +322,7 @@ class Tokenizer(object):
             self._head += 1
 
     def tokenize(self, text):
+        """Build a list of tokens from a string of wikicode and return it."""
         split = self.regex.split(text)
         self._text = [segment for segment in split if segment]
         return self._parse()
diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py
index f687170..c2b7df8 100644
--- a/mwparserfromhell/parser/tokens.py
+++ b/mwparserfromhell/parser/tokens.py
@@ -20,6 +20,16 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+"""
+This module contains the token definitions that are used as an intermediate
+parsing data type; they are stored in a flat list, with each token being
+identified by its type and optional attributes. The token list is generated in
+a syntactically valid form by the
+:py:class:`~mwparserfromhell.parser.tokenizer.Tokenizer`, and then converted
+into the :py:class:`~mwparserfromhell.wikicode.Wikicode` tree by the
+:py:class:`~mwparserfromhell.parser.builder.Builder`.
+"""
+
 from __future__ import unicode_literals
 
 from ..compat import basestring, py3k
@@ -27,6 +37,8 @@ from ..compat import basestring, py3k
 __all__ = ["Token"]
 
 class Token(object):
+    """A token represents the semantic meaning of a unit of wikicode."""
+
     def __init__(self, **kwargs):
         super(Token, self).__setattr__("_kwargs", kwargs)
 
@@ -54,6 +66,7 @@ class Token(object):
         del self._kwargs[key]
 
 def make(name):
+    """Create a new Token class using ``type()`` and add it to ``__all__``."""
     __all__.append(name)
     return type(name if py3k else name.encode("utf8"), (Token,), {})
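
Note on the ``_read()`` semantics documented in the tokenizer hunk: a negative
index only wraps around to the end of the text when *wrap* is passed. A
minimal standalone sketch of that behavior follows; ``mini_read`` and
``SAMPLE`` are hypothetical illustrations, not library API, and *strict* is
omitted for brevity:

    # Sentinels standing in for Tokenizer.START and Tokenizer.END.
    START, END = object(), object()

    def mini_read(text, head, delta=0, wrap=False):
        # Mirrors the documented logic: a negative index returns START
        # unless wrap=True lets it count back from the end of the text;
        # reading past the end returns END.
        index = head + delta
        if index < 0 and (not wrap or abs(index) > len(text)):
            return START
        if index >= len(text):
            return END
        return text[index]

    SAMPLE = ["{{", "foo", "}}"]
    print(mini_read(SAMPLE, 0))                  # "{{"
    print(mini_read(SAMPLE, 0, delta=-1))        # START (no wrapping)
    print(mini_read(SAMPLE, 0, -1, wrap=True))   # "}}" (wraps to the end)
    print(mini_read(SAMPLE, 2, delta=1))         # END (past the end)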
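Likewise, the ``make()`` helper in the tokens.py hunk builds each token class
dynamically with ``type()`` instead of a ``class`` statement. A minimal
runnable sketch of the same pattern; ``DemoToken`` and the sample classes are
hypothetical stand-ins, and the attribute lookup shown is a simplifying
assumption rather than the library's exact ``Token`` implementation:

    class DemoToken(object):
        # Like Token above, keep all attributes in a plain dict.
        def __init__(self, **kwargs):
            super(DemoToken, self).__setattr__("_kwargs", kwargs)

        def __getattr__(self, key):
            # Resolve unknown attributes from the kwargs dict.
            return self._kwargs[key]

    def make(name):
        # type(name, bases, dict) creates a class object at runtime,
        # equivalent to an empty "class <name>(DemoToken): pass" block.
        return type(name, (DemoToken,), {})

    Text = make("Text")
    TemplateOpen = make("TemplateOpen")

    token = Text(text="Hello, world!")
    print(type(token).__name__, token.text)  # Text Hello, world!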