Merge branch 'develop'

11 years ago · 255dfd4a82
--- a/+ 5
+++ b/+ 5
@@ -1,3 +1,8 @@
 v0.3.1 (released August 29, 2013):

 - Fixed a parser bug involving URLs nested inside other markup.
 - Fixed some typos.

 v0.3 (released August 24, 2013):

 - Added complete support for HTML Tags, including forms like <ref>foo</ref>,
--- a/README.rst
+++ b/README.rst
@@ -9,8 +9,8 @@ mwparserfromhell
 that provides an easy-to-use and outrageously powerful parser for MediaWiki_
 wikicode. It supports Python 2 and Python 3.

 Developed by Earwig_ with help from `Σ`_. Full documentation is available on
 ReadTheDocs_.
 Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others.
 Full documentation is available on ReadTheDocs_. Development occurs on GitHub_.

 Installation
 ------------
@@ -148,6 +148,8 @@ following code (via the API_)::
 .. _ReadTheDocs:            http://mwparserfromhell.readthedocs.org
 .. _Earwig:                 http://en.wikipedia.org/wiki/User:The_Earwig
 .. _Σ:                      http://en.wikipedia.org/wiki/User:%CE%A3
 .. _Legoktm:                http://en.wikipedia.org/wiki/User:Legoktm
 .. _GitHub:                 https://github.com/earwig/mwparserfromhell
 .. _Python Package Index:   http://pypi.python.org
 .. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat
 .. _get pip:                http://pypi.python.org/pypi/pip
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,6 +1,15 @@
 Changelog
 =========

 v0.3.1
 ------

 `Released August 29, 2013 <https://github.com/earwig/mwparserfromhell/tree/v0.3.1>`_
 (`changes <https://github.com/earwig/mwparserfromhell/compare/v0.3...v0.3.1>`__):

 - Fixed a parser bug involving URLs nested inside other markup.
 - Fixed some typos.

 v0.3
 ----

--- a/mwparserfromhell/init.py
+++ b/mwparserfromhell/init.py
@@ -31,7 +31,7 @@ from __future__ import unicode_literals
 __author__ = "Ben Kurtovic"
 __copyright__ = "Copyright (C) 2012, 2013 Ben Kurtovic"
 __license__ = "MIT License"
 __version__ = "0.3"
 __version__ = "0.3.1"
 __email__ = "ben.kurtovic@verizon.net"

 from . import (compat, definitions, nodes, parser, smart_list, string_mixin,
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -982,7 +982,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
        return 0;
    }
    Py_DECREF(scheme);
    if (Tokenizer_push(self, LC_EXT_LINK_URI)) {
    if (Tokenizer_push(self, self->topstack->context | LC_EXT_LINK_URI)) {
        Textbuffer_dealloc(scheme_buffer);
        return -1;
    }
@@ -1028,6 +1028,24 @@ Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
 }

 /*
    Decide whether the character at the current head terminates a free
    (bracketless) external link. `this` and `next` are the characters at the
    head and one past it; the character two past the head is read here.
    Returns 1 if the link ends at the head, 0 otherwise.
 */
 static int
 Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
 {
    // The checks below mirror the end sentinels used by Tokenizer_parse().
    Py_UNICODE third = Tokenizer_READ(self, 2);
    int context = self->topstack->context;

    // End of input, a newline, or bracket/angle characters always end it:
    if (this == *"" || this == *"\n")
        return 1;
    if (this == *"[" || this == *"]" || this == *"<" || this == *">")
        return 1;
    // Two consecutive apostrophes:
    if (this == *"'" && next == *"'")
        return 1;
    // A pipe only counts while a template context flag is set:
    if (this == *"|" && (context & LC_TEMPLATE))
        return 1;
    // An equal sign only counts in a template parameter key or a heading:
    if (this == *"=" && (context & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)))
        return 1;
    // "}}" counts in a template context; "}}}" counts in an argument context:
    if (this == *"}" && next == *"}") {
        if (context & LC_TEMPLATE)
            return 1;
        if (third == *"}" && (context & LC_ARGUMENT))
            return 1;
    }
    return 0;
 }

 /*
    Really parse an external link.
 */
 static PyObject*
@@ -1050,35 +1068,31 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
    while (1) {
        this = Tokenizer_READ(self, 0);
        next = Tokenizer_READ(self, 1);
        if (this == *"" || this == *"\n") {
            if (brackets)
                return Tokenizer_fail_route(self);
            self->head--;
            return Tokenizer_pop(self);
        }
        if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) {
        if (this == *"&") {
            PUSH_TAIL_BUFFER(*extra, NULL)
            if (Tokenizer_parse_template_or_argument(self))
            if (Tokenizer_parse_entity(self))
                return NULL;
        }
        else if (this == *"[") {
            if (!brackets) {
                self->head--;
                return Tokenizer_pop(self);
            }
            if (Tokenizer_emit_char(self, *"["))
        else if (this == *"<" && next == *"!"
                 && Tokenizer_READ(self, 2) == *"-"
                 && Tokenizer_READ(self, 3) == *"-") {
            PUSH_TAIL_BUFFER(*extra, NULL)
            if (Tokenizer_parse_comment(self))
                return NULL;
        }
        else if (this == *"]") {
            if (!brackets)
                self->head--;
        else if (!brackets && Tokenizer_is_free_link(self, this, next)) {
            self->head--;
            return Tokenizer_pop(self);
        }
        else if (this == *"&") {
        else if (this == *"" || this == *"\n")
            return Tokenizer_fail_route(self);
        else if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) {
            PUSH_TAIL_BUFFER(*extra, NULL)
            if (Tokenizer_parse_entity(self))
            if (Tokenizer_parse_template_or_argument(self))
                return NULL;
        }
        else if (this == *"]")
            return Tokenizer_pop(self);
        else if (this == *" ") {
            if (brackets) {
                if (Tokenizer_emit(self, ExternalLinkSeparator))
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -261,6 +261,7 @@ static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
 static void Tokenizer_dealloc(Tokenizer*);
 static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
 static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
 static int Tokenizer_handle_dl_term(Tokenizer*);
 static int Tokenizer_parse_tag(Tokenizer*);
 static PyObject* Tokenizer_parse(Tokenizer*, int, int);
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -358,7 +358,7 @@ class Tokenizer(object):
        slashes = self._read() == self._read(1) == "/"
        if not is_scheme(scheme, slashes):
            raise BadRoute()
        self._push(contexts.EXT_LINK_URI)
        self._push(self._context | contexts.EXT_LINK_URI)
        self._emit_text(scheme)
        self._emit_text(":")
        if slashes:
@@ -385,6 +385,18 @@ class Tokenizer(object):
        self._emit_text(this)
        return punct, tail

    def _is_free_link_end(self, this, next):
        """Return whether the current head is the end of a free link.

        *this* and *next* are the strings at the head and one position past
        it; the string two positions past the head is read here. The result
        is truthy when the link should stop at the head and falsy otherwise.
        """
        # These checks mirror the end sentinels used by _parse().
        lookahead = self._read(2)
        context = self._context
        key_or_heading = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING

        # End of input, a newline, or bracket/angle characters always end it:
        if this in (self.END, "\n", "[", "]", "<", ">"):
            return True
        # Two consecutive apostrophes:
        if this == next == "'":
            return True
        # A pipe only counts while a template context flag is set:
        if this == "|":
            flagged = context & contexts.TEMPLATE
            if flagged:
                return flagged
        # An equal sign only counts in a template param key or a heading:
        if this == "=":
            flagged = context & key_or_heading
            if flagged:
                return flagged
        # "}}" counts while a template context flag is set:
        if this == next == "}":
            flagged = context & contexts.TEMPLATE
            if flagged:
                return flagged
        # "}}}" counts while an argument context flag is set:
        return this == next == lookahead == "}" and context & contexts.ARGUMENT

    def _really_parse_external_link(self, brackets):
        """Really parse an external link."""
        if brackets:
@@ -399,27 +411,28 @@ class Tokenizer(object):
        tail = ""
        while True:
            this, next = self._read(), self._read(1)
            if this is self.END or this == "\n":
                if brackets:
                    self._fail_route()
            if this == "&":
                if tail:
                    self._emit_text(tail)
                    tail = ""
                self._parse_entity()
            elif (this == "<" and next == "!" and self._read(2) ==
                  self._read(3) == "-"):
                if tail:
                    self._emit_text(tail)
                    tail = ""
                self._parse_comment()
            elif not brackets and self._is_free_link_end(this, next):
                return self._pop(), tail, -1
            elif this is self.END or this == "\n":
                self._fail_route()
            elif this == next == "{" and self._can_recurse():
                if tail:
                    self._emit_text(tail)
                    tail = ""
                self._parse_template_or_argument()
            elif this == "[":
                if brackets:
                    self._emit_text("[")
                else:
                    return self._pop(), tail, -1
            elif this == "]":
                return self._pop(), tail, 0 if brackets else -1
            elif this == "&":
                if tail:
                    self._emit_text(tail)
                    tail = ""
                self._parse_entity()
                return self._pop(), tail, 0
            elif " " in this:
                before, after = this.split(" ", 1)
                if brackets:
--- a/mwparserfromhell/wikicode.py
+++ b/mwparserfromhell/wikicode.py
@@ -409,7 +409,7 @@ class Wikicode(StringMixIn):

        Each section contains all of its subsections. If *levels* is given, it
        should be an iterable of integers; only sections whose heading levels
        are within it will be returned.If *matches* is given, it should be a
        are within it will be returned. If *matches* is given, it should be a
        regex to be matched against the titles of section headings; only
        sections whose headings match the regex will be included. *flags* can
        be used to override the default regex flags (see :py:meth:`ifilter`) if
--- a/setup.py
+++ b/setup.py
@@ -53,6 +53,7 @@ setup(
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.3",
        "Topic :: Text Processing :: Markup"
    ],
--- a/tests/tokenizer/integration.mwtest
+++ b/tests/tokenizer/integration.mwtest
@@ -72,3 +72,81 @@ name:   link_inside_dl_2
 label:  an external link inside a def list, such that the external link is not parsed
 input:  ";;;malito:example"
 output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")]

 ---

 name:   link_inside_template
 label:  an external link nested inside a template, before the end
 input:  "{{URL|http://example.com}}"
 output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateClose()]

 ---

 name:   link_inside_template_2
 label:  an external link nested inside a template, before a separator
 input:  "{{URL|http://example.com|foobar}}"
 output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateParamSeparator(), Text(text="foobar"), TemplateClose()]

 ---

 name:   link_inside_template_3
 label:  an external link nested inside a template, before an equal sign
 input:  "{{URL|http://example.com=foobar}}"
 output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateParamEquals(), Text(text="foobar"), TemplateClose()]

 ---

 name:   link_inside_argument
 label:  an external link nested inside an argument
 input:  "{{{URL|http://example.com}}}"
 output: [ArgumentOpen(), Text(text="URL"), ArgumentSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), ArgumentClose()]

 ---

 name:   link_inside_heading
 label:  an external link nested inside a heading
 input:  "==http://example.com=="
 output: [HeadingStart(level=2), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), HeadingEnd()]

 ---

 name:   link_inside_tag_body
 label:  an external link nested inside the body of a tag
 input:  "<ref>http://example.com</ref>"
 output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=""), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()]

 ---

 name:   link_inside_tag_style
 label:  an external link nested inside style tags
 input:  "''http://example.com''"
 output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TagOpenClose(), Text(text="i"), TagCloseClose()]

 ---

 name:   style_tag_inside_link
 label:  style tags disrupting an external link
 input:  "http://example.com/foo''bar''"
 output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()]

 ---

 name:   comment_inside_link
 label:  an HTML comment inside an external link
 input:  "http://example.com/foo<!--comment-->bar"
 output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()]

 ---

 name:   bracketed_link_inside_template
 label:  a bracketed external link nested inside a template, before the end
 input:  "{{URL|[http://example.com}}]"
 output: [Text(text="{{URL|"), ExternalLinkOpen(brackets=True), Text(text="http://example.com}}"), ExternalLinkClose()]


 ---

 name:   comment_inside_bracketed_link
 label:  an HTML comment inside a bracketed external link
 input:  "[http://example.com/foo<!--comment-->bar]"
 output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()]