diff --git a/CHANGELOG b/CHANGELOG
index 67214fa..4663700 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,8 @@
+v0.3.1 (released August 29, 2013):
+
+- Fixed a parser bug involving URLs nested inside other markup.
+- Fixed some typos.
+
v0.3 (released August 24, 2013):
- Added complete support for HTML Tags, including forms like [foo],
diff --git a/README.rst b/README.rst
index b5fd912..5b4cfe1 100644
--- a/README.rst
+++ b/README.rst
@@ -9,8 +9,8 @@ mwparserfromhell
that provides an easy-to-use and outrageously powerful parser for MediaWiki_
wikicode. It supports Python 2 and Python 3.
-Developed by Earwig_ with help from `Σ`_. Full documentation is available on
-ReadTheDocs_.
+Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others.
+Full documentation is available on ReadTheDocs_. Development occurs on GitHub_.
Installation
------------
@@ -148,6 +148,8 @@ following code (via the API_)::
.. _ReadTheDocs: http://mwparserfromhell.readthedocs.org
.. _Earwig: http://en.wikipedia.org/wiki/User:The_Earwig
.. _Σ: http://en.wikipedia.org/wiki/User:%CE%A3
+.. _Legoktm: http://en.wikipedia.org/wiki/User:Legoktm
+.. _GitHub: https://github.com/earwig/mwparserfromhell
.. _Python Package Index: http://pypi.python.org
.. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat
.. _get pip: http://pypi.python.org/pypi/pip
diff --git a/docs/changelog.rst b/docs/changelog.rst
index b6db9d9..3546f0c 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,6 +1,15 @@
Changelog
=========
+v0.3.1
+------
+
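+`Released August 29, 2013 <https://github.com/earwig/mwparserfromhell/tree/v0.3.1>`_
+(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.3...v0.3.1>`__):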
+
+- Fixed a parser bug involving URLs nested inside other markup.
+- Fixed some typos.
+
v0.3
----
diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py
index 6a45a11..a5fda7c 100644
--- a/mwparserfromhell/__init__.py
+++ b/mwparserfromhell/__init__.py
@@ -31,7 +31,7 @@ from __future__ import unicode_literals
__author__ = "Ben Kurtovic"
__copyright__ = "Copyright (C) 2012, 2013 Ben Kurtovic"
__license__ = "MIT License"
-__version__ = "0.3"
+__version__ = "0.3.1"
__email__ = "ben.kurtovic@verizon.net"
from . import (compat, definitions, nodes, parser, smart_list, string_mixin,
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index c9527ab..609a595 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -982,7 +982,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
return 0;
}
Py_DECREF(scheme);
- if (Tokenizer_push(self, LC_EXT_LINK_URI)) {
+ if (Tokenizer_push(self, self->topstack->context | LC_EXT_LINK_URI)) {
Textbuffer_dealloc(scheme_buffer);
return -1;
}
@@ -1028,6 +1028,24 @@ Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
}
/*
+ Return whether the current head is the end of a free link.
+
+ "this" and "next" are the characters at the head and one past it, as
+ already read by the caller; the character two past the head is read here.
+ The result is nonzero when "this" is the NUL character (*""), a newline,
+ a square or angle bracket, or the first of two consecutive apostrophes,
+ and also when it begins markup that closes a construct recorded in the
+ current stack's context flags:
+   - "|" while (context & LC_TEMPLATE) is nonzero;
+   - "=" while (context & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) is nonzero;
+   - "}}" while (context & LC_TEMPLATE) is nonzero, or "}}}" while
+     (context & LC_ARGUMENT) is nonzero.
+ Tokenizer_parse_free_uri_scheme() pushes the free link's stack with the
+ enclosing context ORed in, which is how those flags can be set here.
+ The only visible caller consults this for bracketless links only.
+*/
+static int
+Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
+{
+ // Built from Tokenizer_parse()'s end sentinels:
+ Py_UNICODE after = Tokenizer_READ(self, 2);
+ int ctx = self->topstack->context;
+
+ return (this == *"" || this == *"\n" || this == *"[" || this == *"]" ||
+ this == *"<" || this == *">" || (this == *"'" && next == *"'") ||
+ (this == *"|" && ctx & LC_TEMPLATE) ||
+ (this == *"=" && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
+ (this == *"}" && next == *"}" &&
+ (ctx & LC_TEMPLATE || (after == *"}" && ctx & LC_ARGUMENT))));
+}
+
+/*
Really parse an external link.
*/
static PyObject*
@@ -1050,35 +1068,31 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
while (1) {
this = Tokenizer_READ(self, 0);
next = Tokenizer_READ(self, 1);
- if (this == *"" || this == *"\n") {
- if (brackets)
- return Tokenizer_fail_route(self);
- self->head--;
- return Tokenizer_pop(self);
- }
- if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) {
+ if (this == *"&") {
PUSH_TAIL_BUFFER(*extra, NULL)
- if (Tokenizer_parse_template_or_argument(self))
+ if (Tokenizer_parse_entity(self))
return NULL;
}
- else if (this == *"[") {
- if (!brackets) {
- self->head--;
- return Tokenizer_pop(self);
- }
- if (Tokenizer_emit_char(self, *"["))
+ else if (this == *"<" && next == *"!"
+ && Tokenizer_READ(self, 2) == *"-"
+ && Tokenizer_READ(self, 3) == *"-") {
+ PUSH_TAIL_BUFFER(*extra, NULL)
+ if (Tokenizer_parse_comment(self))
return NULL;
}
- else if (this == *"]") {
- if (!brackets)
- self->head--;
+ else if (!brackets && Tokenizer_is_free_link(self, this, next)) {
+ self->head--;
return Tokenizer_pop(self);
}
- else if (this == *"&") {
+ else if (this == *"" || this == *"\n")
+ return Tokenizer_fail_route(self);
+ else if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) {
PUSH_TAIL_BUFFER(*extra, NULL)
- if (Tokenizer_parse_entity(self))
+ if (Tokenizer_parse_template_or_argument(self))
return NULL;
}
+ else if (this == *"]")
+ return Tokenizer_pop(self);
else if (this == *" ") {
if (brackets) {
if (Tokenizer_emit(self, ExternalLinkSeparator))
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index da3c57a..48bdf26 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -261,6 +261,7 @@ static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
static void Tokenizer_dealloc(Tokenizer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
static int Tokenizer_parse_entity(Tokenizer*);
+static int Tokenizer_parse_comment(Tokenizer*);
static int Tokenizer_handle_dl_term(Tokenizer*);
static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, int, int);
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 8fae729..eb4c571 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -358,7 +358,7 @@ class Tokenizer(object):
slashes = self._read() == self._read(1) == "/"
if not is_scheme(scheme, slashes):
raise BadRoute()
- self._push(contexts.EXT_LINK_URI)
+ self._push(self._context | contexts.EXT_LINK_URI)
self._emit_text(scheme)
self._emit_text(":")
if slashes:
@@ -385,6 +385,18 @@ class Tokenizer(object):
self._emit_text(this)
return punct, tail
+ def _is_free_link_end(self, this, next):
+ """Return whether the current head is the end of a free link.
+
+ *this* and *next* are the values the caller read at the head and one
+ past it; the value two past the head is read here. The result is
+ truthy when *this* is ``self.END``, a newline, a square or angle
+ bracket, or the first of two consecutive apostrophes, and also when
+ it begins markup that closes a construct recorded in the current
+ context: ``|`` with a ``TEMPLATE`` bit set, ``=`` with
+ ``TEMPLATE_PARAM_KEY`` or ``HEADING`` set, ``}}`` with a ``TEMPLATE``
+ bit set, or ``}}}`` with ``ARGUMENT`` set.
+ """
+ # Built from _parse()'s end sentinels:
+ after, ctx = self._read(2), self._context
+ equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING
+ return (this in (self.END, "\n", "[", "]", "<", ">") or
+ this == next == "'" or
+ (this == "|" and ctx & contexts.TEMPLATE) or
+ (this == "=" and ctx & equal_sign_contexts) or
+ (this == next == "}" and ctx & contexts.TEMPLATE) or
+ (this == next == after == "}" and ctx & contexts.ARGUMENT))
+
def _really_parse_external_link(self, brackets):
"""Really parse an external link."""
if brackets:
@@ -399,27 +411,28 @@ class Tokenizer(object):
tail = ""
while True:
this, next = self._read(), self._read(1)
- if this is self.END or this == "\n":
- if brackets:
- self._fail_route()
+ if this == "&":
+ if tail:
+ self._emit_text(tail)
+ tail = ""
+ self._parse_entity()
+ elif (this == "<" and next == "!" and self._read(2) ==
+ self._read(3) == "-"):
+ if tail:
+ self._emit_text(tail)
+ tail = ""
+ self._parse_comment()
+ elif not brackets and self._is_free_link_end(this, next):
return self._pop(), tail, -1
+ elif this is self.END or this == "\n":
+ self._fail_route()
elif this == next == "{" and self._can_recurse():
if tail:
self._emit_text(tail)
tail = ""
self._parse_template_or_argument()
- elif this == "[":
- if brackets:
- self._emit_text("[")
- else:
- return self._pop(), tail, -1
elif this == "]":
- return self._pop(), tail, 0 if brackets else -1
- elif this == "&":
- if tail:
- self._emit_text(tail)
- tail = ""
- self._parse_entity()
+ return self._pop(), tail, 0
elif " " in this:
before, after = this.split(" ", 1)
if brackets:
diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py
index c3249d9..08fd469 100644
--- a/mwparserfromhell/wikicode.py
+++ b/mwparserfromhell/wikicode.py
@@ -409,7 +409,7 @@ class Wikicode(StringMixIn):
Each section contains all of its subsections. If *levels* is given, it
should be a iterable of integers; only sections whose heading levels
- are within it will be returned.If *matches* is given, it should be a
+ are within it will be returned. If *matches* is given, it should be a
regex to be matched against the titles of section headings; only
sections whose headings match the regex will be included. *flags* can
be used to override the default regex flags (see :py:meth:`ifilter`) if
diff --git a/setup.py b/setup.py
index 3ef7e0e..d2ad17d 100644
--- a/setup.py
+++ b/setup.py
@@ -53,6 +53,7 @@ setup(
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 2.7",
+ "Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.3",
"Topic :: Text Processing :: Markup"
],
diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest
index 083b12c..37ef9f1 100644
--- a/tests/tokenizer/integration.mwtest
+++ b/tests/tokenizer/integration.mwtest
@@ -72,3 +72,81 @@ name: link_inside_dl_2
label: an external link inside a def list, such that the external link is not parsed
input: ";;;malito:example"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")]
+
+---
+
+name: link_inside_template
+label: an external link nested inside a template, before the end
+input: "{{URL|http://example.com}}"
+output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateClose()]
+
+---
+
+name: link_inside_template_2
+label: an external link nested inside a template, before a separator
+input: "{{URL|http://example.com|foobar}}"
+output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateParamSeparator(), Text(text="foobar"), TemplateClose()]
+
+---
+
+name: link_inside_template_3
+label: an external link nested inside a template, before an equal sign
+input: "{{URL|http://example.com=foobar}}"
+output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateParamEquals(), Text(text="foobar"), TemplateClose()]
+
+---
+
+name: link_inside_argument
+label: an external link nested inside an argument
+input: "{{{URL|http://example.com}}}"
+output: [ArgumentOpen(), Text(text="URL"), ArgumentSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), ArgumentClose()]
+
+---
+
+name: link_inside_heading
+label: an external link nested inside a heading
+input: "==http://example.com=="
+output: [HeadingStart(level=2), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), HeadingEnd()]
+
+---
+
+name: link_inside_tag_body
+label: an external link nested inside the body of a tag
+input: "<ref>http://example.com</ref>"
+output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=""), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()]
+
+---
+
+name: link_inside_tag_style
+label: an external link nested inside style tags
+input: "''http://example.com''"
+output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TagOpenClose(), Text(text="i"), TagCloseClose()]
+
+---
+
+name: style_tag_inside_link
+label: style tags disrupting an external link
+input: "http://example.com/foo''bar''"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()]
+
+---
+
+name: comment_inside_link
+label: an HTML comment inside an external link
+input: "http://example.com/foo<!--comment-->bar"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()]
+
+---
+
+name: bracketed_link_inside_template
+label: a bracketed external link nested inside a template, before the end
+input: "{{URL|[http://example.com}}]"
+output: [Text(text="{{URL|"), ExternalLinkOpen(brackets=True), Text(text="http://example.com}}"), ExternalLinkClose()]
+
+
+---
+
+name: comment_inside_bracketed_link
+label: an HTML comment inside a bracketed external link
+input: "[http://example.com/foo<!--comment-->bar]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()]