diff --git a/.gitignore b/.gitignore index ba02a04..d70b37d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ .DS_Store __pycache__ build +dist docs/_build diff --git a/README.rst b/README.rst index ff416e2..77f12c7 100644 --- a/README.rst +++ b/README.rst @@ -28,9 +28,9 @@ Normal usage is rather straightforward (where ``text`` is page text):: >>> import mwparserfromhell >>> wikicode = mwparserfromhell.parse(text) -``wikicode`` is a ``mwparserfromhell.wikicode.Wikicode`` object, which acts -like an ordinary ``unicode`` object (or ``str`` in Python 3) with some extra -methods. For example:: +``wikicode`` is a ``mwparserfromhell.Wikicode`` object, which acts like an +ordinary ``unicode`` object (or ``str`` in Python 3) with some extra methods. +For example:: >>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?" >>> wikicode = mwparserfromhell.parse(text) @@ -70,7 +70,7 @@ passing ``recursive=True``:: >>> mwparserfromhell.parse(text).filter_templates(recursive=True) ['{{foo|{{bar}}={{baz|{{spam}}}}}}', '{{bar}}', '{{baz|{{spam}}}}', '{{spam}}'] -Templates can be easily modified to add, remove, alter or params. ``Wikicode`` +Templates can be easily modified to add, remove, or alter params. ``Wikicode`` can also be treated like a list with ``append()``, ``insert()``, ``remove()``, ``replace()``, and more:: @@ -131,7 +131,7 @@ following code (via the API_):: .. _MediaWiki: http://mediawiki.org .. _Earwig: http://en.wikipedia.org/wiki/User:The_Earwig -.. _Σ: http://en.wikipedia.org/wiki/User:Σ +.. _Σ: http://en.wikipedia.org/wiki/User:%CE%A3 .. _Python Package Index: http://pypi.python.org .. _get pip: http://pypi.python.org/pypi/pip .. 
_EarwigBot: https://github.com/earwig/earwigbot diff --git a/docs/api/mwparserfromhell.nodes.rst b/docs/api/mwparserfromhell.nodes.rst index 9db797d..d1016f9 100644 --- a/docs/api/mwparserfromhell.nodes.rst +++ b/docs/api/mwparserfromhell.nodes.rst @@ -17,6 +17,14 @@ nodes Package :undoc-members: :show-inheritance: +:mod:`comment` Module +--------------------- + +.. automodule:: mwparserfromhell.nodes.comment + :members: + :undoc-members: + :show-inheritance: + :mod:`heading` Module --------------------- @@ -56,6 +64,14 @@ nodes Package :undoc-members: :show-inheritance: +:mod:`wikilink` Module +---------------------- + +.. automodule:: mwparserfromhell.nodes.wikilink + :members: + :undoc-members: + :show-inheritance: + Subpackages ----------- diff --git a/docs/conf.py b/docs/conf.py index 2ef841d..c537d37 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -50,7 +50,7 @@ copyright = u'2012 Ben Kurtovic' # The short X.Y version. version = '0.1' # The full version, including alpha/beta/rc tags. -release = '0.1' +release = '0.1.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/index.rst b/docs/index.rst index 84b4c74..e198783 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,7 +9,7 @@ Developed by Earwig_ with help from `Σ`_. .. _MediaWiki: http://mediawiki.org .. _Earwig: http://en.wikipedia.org/wiki/User:The_Earwig -.. _Σ: http://en.wikipedia.org/wiki/User:Σ +.. _Σ: http://en.wikipedia.org/wiki/User:%CE%A3 Installation ------------ diff --git a/docs/usage.rst b/docs/usage.rst index c4472f9..2fd19af 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -48,7 +48,7 @@ by passing *recursive=True*:: >>> mwparserfromhell.parse(text).filter_templates(recursive=True) ['{{foo|{{bar}}={{baz|{{spam}}}}}}', '{{bar}}', '{{baz|{{spam}}}}', '{{spam}}'] -Templates can be easily modified to add, remove alter or params. +Templates can be easily modified to add, remove, or alter params. 
:py:class:`~.Wikicode` can also be treated like a list with :py:meth:`~.Wikicode.append`, :py:meth:`~.Wikicode.insert`, :py:meth:`~.Wikicode.remove`, :py:meth:`~.Wikicode.replace`, and more:: diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index d4aaa28..bdf5712 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -31,7 +31,7 @@ from __future__ import unicode_literals __author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2012 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.1" +__version__ = "0.1.1" __email__ = "ben.kurtovic@verizon.net" from . import nodes, parser, smart_list, string_mixin, wikicode diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index c04f718..86a8746 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -68,7 +68,9 @@ class Node(StringMixIn): from . import extras from .text import Text from .argument import Argument +from .comment import Comment from .heading import Heading from .html_entity import HTMLEntity from .tag import Tag from .template import Template +from .wikilink import Wikilink diff --git a/mwparserfromhell/nodes/comment.py b/mwparserfromhell/nodes/comment.py new file mode 100644 index 0000000..3d06261 --- /dev/null +++ b/mwparserfromhell/nodes/comment.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial 
portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import unicode_literals + +from . import Node +from ..compat import str + +__all__ = ["Comment"] + +class Comment(Node): + """Represents a hidden HTML comment, like ``<!-- foo -->``.""" + def __init__(self, contents): + super(Comment, self).__init__() + self._contents = contents + + def __unicode__(self): + return "" + + @property + def contents(self): + """The hidden text contained between ``<!--`` and ``-->``.""" + return self._contents + + @contents.setter + def contents(self, value): + self._contents = str(value) diff --git a/mwparserfromhell/nodes/heading.py b/mwparserfromhell/nodes/heading.py index 97878b2..8f389d3 100644 --- a/mwparserfromhell/nodes/heading.py +++ b/mwparserfromhell/nodes/heading.py @@ -45,7 +45,7 @@ class Heading(Node): yield self.title, child def __strip__(self, normalize, collapse): - return self.title + return self.title.strip_code(normalize, collapse) def __showtree__(self, write, get, mark): write("=" * self.level) diff --git a/mwparserfromhell/nodes/wikilink.py b/mwparserfromhell/nodes/wikilink.py new file mode 100644 index 0000000..73f2a8d --- /dev/null +++ b/mwparserfromhell/nodes/wikilink.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import unicode_literals + +from . import Node +from ..compat import str +from ..utils import parse_anything + +__all__ = ["Wikilink"] + +class Wikilink(Node): + """Represents an internal wikilink, like ``[[Foo|Bar]]``.""" + def __init__(self, title, text=None): + super(Wikilink, self).__init__() + self._title = title + self._text = text + + def __unicode__(self): + if self.text is not None: + return "[[" + str(self.title) + "|" + str(self.text) + "]]" + return "[[" + str(self.title) + "]]" + + def __iternodes__(self, getter): + yield None, self + for child in getter(self.title): + yield self.title, child + if self.text is not None: + for child in getter(self.text): + yield self.text, child + + def __strip__(self, normalize, collapse): + if self.text is not None: + return self.text.strip_code(normalize, collapse) + return self.title.strip_code(normalize, collapse) + + def __showtree__(self, write, get, mark): + write("[[") + get(self.title) + if self.text is not None: + write(" | ") + mark() + get(self.text) + write("]]") + + @property + def title(self): + """The title of the linked page, as a :py:class:`~.Wikicode` object.""" + return self._title + + 
@property + def text(self): + """The text to display (if any), as a :py:class:`~.Wikicode` object.""" + return self._text + + @title.setter + def title(self, value): + self._title = parse_anything(value) + + @text.setter + def text(self, value): + self._text = parse_anything(value) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 94fd57d..61a8209 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -24,7 +24,8 @@ from __future__ import unicode_literals from . import tokens from ..compat import str -from ..nodes import Argument, Heading, HTMLEntity, Tag, Template, Text +from ..nodes import (Argument, Comment, Heading, HTMLEntity, Tag, Template, + Text, Wikilink) from ..nodes.extras import Attribute, Parameter from ..smart_list import SmartList from ..wikicode import Wikicode @@ -125,8 +126,24 @@ class Builder(object): else: self._write(self._handle_token(token)) + def _handle_wikilink(self): + """Handle a case where a wikilink is at the head of the tokens.""" + title = None + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.WikilinkSeparator): + title = self._pop() + self._push() + elif isinstance(token, tokens.WikilinkClose): + if title is not None: + return Wikilink(title, self._pop()) + return Wikilink(self._pop()) + else: + self._write(self._handle_token(token)) + def _handle_entity(self): - """Handle a case where a HTML entity is at the head of the tokens.""" + """Handle a case where an HTML entity is at the head of the tokens.""" token = self._tokens.pop() if isinstance(token, tokens.HTMLEntityNumeric): token = self._tokens.pop() @@ -152,6 +169,17 @@ class Builder(object): else: self._write(self._handle_token(token)) + def _handle_comment(self): + """Handle a case where a hidden comment is at the head of the tokens.""" + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.CommentEnd): + contents 
= self._pop() + return Comment(contents) + else: + self._write(self._handle_token(token)) + def _handle_attribute(self): """Handle a case where a tag attribute is at the head of the tokens.""" name, quoted = None, False @@ -205,10 +233,14 @@ class Builder(object): return self._handle_template() elif isinstance(token, tokens.ArgumentOpen): return self._handle_argument() + elif isinstance(token, tokens.WikilinkOpen): + return self._handle_wikilink() elif isinstance(token, tokens.HTMLEntityStart): return self._handle_entity() elif isinstance(token, tokens.HeadingStart): return self._handle_heading(token) + elif isinstance(token, tokens.CommentStart): + return self._handle_comment() elif isinstance(token, tokens.TagOpenOpen): return self._handle_tag(token) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 5969239..9d41870 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -35,49 +35,62 @@ will cover ``BAR == 0b10`` and ``BAZ == 0b01``). 
Local (stack-specific) contexts: -* :py:const:`TEMPLATE` (``0b00000000111``) +* :py:const:`TEMPLATE` - * :py:const:`TEMPLATE_NAME` (``0b00000000001``) - * :py:const:`TEMPLATE_PARAM_KEY` (``0b00000000010``) - * :py:const:`TEMPLATE_PARAM_VALUE` (``0b00000000100``) + * :py:const:`TEMPLATE_NAME` + * :py:const:`TEMPLATE_PARAM_KEY` + * :py:const:`TEMPLATE_PARAM_VALUE` -* :py:const:`ARGUMENT` (``0b00000011000``) +* :py:const:`ARGUMENT` - * :py:const:`ARGUMENT_NAME` (``0b00000001000``) - * :py:const:`ARGUMENT_DEFAULT` (``0b00000010000``) + * :py:const:`ARGUMENT_NAME` + * :py:const:`ARGUMENT_DEFAULT` -* :py:const:`HEADING` (``0b111111000``) +* :py:const:`WIKILINK` - * :py:const:`HEADING_LEVEL_1` (``0b00000100000``) - * :py:const:`HEADING_LEVEL_2` (``0b00001000000``) - * :py:const:`HEADING_LEVEL_3` (``0b00010000000``) - * :py:const:`HEADING_LEVEL_4` (``0b00100000000``) - * :py:const:`HEADING_LEVEL_5` (``0b01000000000``) - * :py:const:`HEADING_LEVEL_6` (``0b10000000000``) + * :py:const:`WIKILINK_TITLE` + * :py:const:`WIKILINK_TEXT` + +* :py:const:`HEADING` + + * :py:const:`HEADING_LEVEL_1` + * :py:const:`HEADING_LEVEL_2` + * :py:const:`HEADING_LEVEL_3` + * :py:const:`HEADING_LEVEL_4` + * :py:const:`HEADING_LEVEL_5` + * :py:const:`HEADING_LEVEL_6` + +* :py:const:`COMMENT` Global contexts: -* :py:const:`GL_HEADING` (``0b1``) +* :py:const:`GL_HEADING` """ # Local contexts: -TEMPLATE = 0b00000000111 -TEMPLATE_NAME = 0b00000000001 -TEMPLATE_PARAM_KEY = 0b00000000010 -TEMPLATE_PARAM_VALUE = 0b00000000100 - -ARGUMENT = 0b00000011000 -ARGUMENT_NAME = 0b00000001000 -ARGUMENT_DEFAULT = 0b00000010000 - -HEADING = 0b11111100000 -HEADING_LEVEL_1 = 0b00000100000 -HEADING_LEVEL_2 = 0b00001000000 -HEADING_LEVEL_3 = 0b00010000000 -HEADING_LEVEL_4 = 0b00100000000 -HEADING_LEVEL_5 = 0b01000000000 -HEADING_LEVEL_6 = 0b10000000000 +TEMPLATE = 0b00000000000111 +TEMPLATE_NAME = 0b00000000000001 +TEMPLATE_PARAM_KEY = 0b00000000000010 +TEMPLATE_PARAM_VALUE = 0b00000000000100 + +ARGUMENT = 
0b00000000011000 +ARGUMENT_NAME = 0b00000000001000 +ARGUMENT_DEFAULT = 0b00000000010000 + +WIKILINK = 0b00000001100000 +WIKILINK_TITLE = 0b00000000100000 +WIKILINK_TEXT = 0b00000001000000 + +HEADING = 0b01111110000000 +HEADING_LEVEL_1 = 0b00000010000000 +HEADING_LEVEL_2 = 0b00000100000000 +HEADING_LEVEL_3 = 0b00001000000000 +HEADING_LEVEL_4 = 0b00010000000000 +HEADING_LEVEL_5 = 0b00100000000000 +HEADING_LEVEL_6 = 0b01000000000000 + +COMMENT = 0b10000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2086214..a8ce88f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -41,8 +41,8 @@ class Tokenizer(object): START = object() END = object() MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", - "/", "-", "\n", END] - regex = re.compile(r"([{}\[\]<>|=&#*;:/\-\n])", flags=re.IGNORECASE) + "/", "-", "!", "\n", END] + regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE) def __init__(self): self._text = None @@ -83,9 +83,18 @@ class Tokenizer(object): self._stack.append(tokens.Text(text="".join(self._textbuffer))) self._textbuffer = [] - def _pop(self): - """Pop the current stack/context/textbuffer, returing the stack.""" + def _pop(self, keep_context=False): + """Pop the current stack/context/textbuffer, returning the stack. + + If *keep_context* is ``True``, then we will replace the underlying + stack's context with the current stack's. 
+ """ self._push_textbuffer() + if keep_context: + context = self._context + stack = self._stacks.pop()[0] + self._context = context + return stack return self._stacks.pop()[0] def _fail_route(self): @@ -225,14 +234,23 @@ class Tokenizer(object): if self._context & contexts.TEMPLATE_NAME: self._verify_safe(["\n", "{", "}", "[", "]"]) self._context ^= contexts.TEMPLATE_NAME - if self._context & contexts.TEMPLATE_PARAM_VALUE: + elif self._context & contexts.TEMPLATE_PARAM_VALUE: self._context ^= contexts.TEMPLATE_PARAM_VALUE + elif self._context & contexts.TEMPLATE_PARAM_KEY: + self._write_all(self._pop(keep_context=True)) self._context |= contexts.TEMPLATE_PARAM_KEY self._write(tokens.TemplateParamSeparator()) + self._push(self._context) def _handle_template_param_value(self): """Handle a template parameter's value at the head of the string.""" - self._verify_safe(["\n", "{{", "}}"]) + try: + self._verify_safe(["\n", "{{", "}}"]) + except BadRoute: + self._pop() + raise + else: + self._write_all(self._pop(keep_context=True)) self._context ^= contexts.TEMPLATE_PARAM_KEY self._context |= contexts.TEMPLATE_PARAM_VALUE self._write(tokens.TemplateParamEquals()) @@ -241,6 +259,8 @@ class Tokenizer(object): """Handle the end of a template at the head of the string.""" if self._context & contexts.TEMPLATE_NAME: self._verify_safe(["\n", "{", "}", "[", "]"]) + elif self._context & contexts.TEMPLATE_PARAM_KEY: + self._write_all(self._pop(keep_context=True)) self._head += 1 return self._pop() @@ -258,6 +278,34 @@ class Tokenizer(object): self._head += 2 return self._pop() + def _parse_wikilink(self): + """Parse an internal wikilink at the head of the wikicode string.""" + self._head += 2 + reset = self._head - 1 + try: + wikilink = self._parse(contexts.WIKILINK_TITLE) + except BadRoute: + self._head = reset + self._write_text("[[") + else: + self._write(tokens.WikilinkOpen()) + self._write_all(wikilink) + self._write(tokens.WikilinkClose()) + + def 
_handle_wikilink_separator(self): + """Handle the separator between a wikilink's title and its text.""" + self._verify_safe(["\n", "{", "}", "[", "]"]) + self._context ^= contexts.WIKILINK_TITLE + self._context |= contexts.WIKILINK_TEXT + self._write(tokens.WikilinkSeparator()) + + def _handle_wikilink_end(self): + """Handle the end of a wikilink at the head of the string.""" + if self._context & contexts.WIKILINK_TITLE: + self._verify_safe(["\n", "{", "}", "[", "]"]) + self._head += 1 + return self._pop() + def _parse_heading(self): """Parse a section heading at the head of the wikicode string.""" self._global |= contexts.GL_HEADING @@ -307,7 +355,7 @@ class Tokenizer(object): return self._pop(), after_level def _really_parse_entity(self): - """Actually parse a HTML entity and ensure that it is valid.""" + """Actually parse an HTML entity and ensure that it is valid.""" self._write(tokens.HTMLEntityStart()) self._head += 1 @@ -349,7 +397,7 @@ class Tokenizer(object): self._write(tokens.HTMLEntityEnd()) def _parse_entity(self): - """Parse a HTML entity at the head of the wikicode string.""" + """Parse an HTML entity at the head of the wikicode string.""" reset = self._head self._push() try: @@ -360,6 +408,21 @@ class Tokenizer(object): else: self._write_all(self._pop()) + def _parse_comment(self): + """Parse an HTML comment at the head of the wikicode string.""" + self._head += 4 + reset = self._head - 1 + try: + comment = self._parse(contexts.COMMENT) + except BadRoute: + self._head = reset + self._write_text(" + TagOpenOpen = make("TagOpenOpen") # < TagAttrStart = make("TagAttrStart") TagAttrEquals = make("TagAttrEquals") # = diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index cebc61b..e0f5acd 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -24,7 +24,7 @@ from __future__ import unicode_literals import re from .compat import maxsize, str -from .nodes import Heading, Node, Tag, Template, Text +from .nodes 
import Heading, Node, Tag, Template, Text, Wikilink from .string_mixin import StringMixIn from .utils import parse_anything @@ -303,6 +303,14 @@ class Wikicode(StringMixIn): if not matches or re.search(matches, str(node), flags): yield node + def ifilter_links(self, recursive=False, matches=None, flags=FLAGS): + """Iterate over wikilink nodes. + + This is equivalent to :py:meth:`ifilter` with *forcetype* set to + :py:class:`~.Wikilink`. + """ + return self.ifilter(recursive, matches, flags, forcetype=Wikilink) + def ifilter_templates(self, recursive=False, matches=None, flags=FLAGS): """Iterate over template nodes. @@ -335,6 +343,14 @@ class Wikicode(StringMixIn): """ return list(self.ifilter(recursive, matches, flags, forcetype)) + def filter_links(self, recursive=False, matches=None, flags=FLAGS): + """Return a list of wikilink nodes. + + This is equivalent to calling :py:func:`list` on + :py:meth:`ifilter_links`. + """ + return list(self.ifilter_links(recursive, matches, flags)) + def filter_templates(self, recursive=False, matches=None, flags=FLAGS): """Return a list of template nodes.