@@ -1,3 +1,8 @@ | |||
v0.3.1 (released August 29, 2013): | |||
- Fixed a parser bug involving URLs nested inside other markup. | |||
- Fixed some typos. | |||
v0.3 (released August 24, 2013): | |||
- Added complete support for HTML Tags, including forms like <ref>foo</ref>, | |||
@@ -9,8 +9,8 @@ mwparserfromhell | |||
that provides an easy-to-use and outrageously powerful parser for MediaWiki_ | |||
wikicode. It supports Python 2 and Python 3. | |||
Developed by Earwig_ with help from `Σ`_. Full documentation is available on | |||
ReadTheDocs_. | |||
Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others. | |||
Full documentation is available on ReadTheDocs_. Development occurs on GitHub_. | |||
Installation | |||
------------ | |||
@@ -148,6 +148,8 @@ following code (via the API_):: | |||
.. _ReadTheDocs: http://mwparserfromhell.readthedocs.org | |||
.. _Earwig: http://en.wikipedia.org/wiki/User:The_Earwig | |||
.. _Σ: http://en.wikipedia.org/wiki/User:%CE%A3 | |||
.. _Legoktm: http://en.wikipedia.org/wiki/User:Legoktm | |||
.. _GitHub: https://github.com/earwig/mwparserfromhell | |||
.. _Python Package Index: http://pypi.python.org | |||
.. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat | |||
.. _get pip: http://pypi.python.org/pypi/pip | |||
@@ -1,6 +1,15 @@ | |||
Changelog | |||
========= | |||
v0.3.1 | |||
------ | |||
`Released August 29, 2013 <https://github.com/earwig/mwparserfromhell/tree/v0.3.1>`_ | |||
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.3...v0.3.1>`__): | |||
- Fixed a parser bug involving URLs nested inside other markup. | |||
- Fixed some typos. | |||
v0.3 | |||
---- | |||
@@ -31,7 +31,7 @@ from __future__ import unicode_literals | |||
__author__ = "Ben Kurtovic" | |||
__copyright__ = "Copyright (C) 2012, 2013 Ben Kurtovic" | |||
__license__ = "MIT License" | |||
__version__ = "0.3" | |||
__version__ = "0.3.1" | |||
__email__ = "ben.kurtovic@verizon.net" | |||
from . import (compat, definitions, nodes, parser, smart_list, string_mixin, | |||
@@ -982,7 +982,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||
return 0; | |||
} | |||
Py_DECREF(scheme); | |||
if (Tokenizer_push(self, LC_EXT_LINK_URI)) { | |||
if (Tokenizer_push(self, self->topstack->context | LC_EXT_LINK_URI)) { | |||
Textbuffer_dealloc(scheme_buffer); | |||
return -1; | |||
} | |||
@@ -1028,6 +1028,24 @@ Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, | |||
} | |||
/* | |||
Return whether the current head is the end of a free link. | |||
*/ | |||
static int | |||
Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next) | |||
{ | |||
// Built from Tokenizer_parse()'s end sentinels: | |||
Py_UNICODE after = Tokenizer_READ(self, 2); | |||
int ctx = self->topstack->context; | |||
return (this == *"" || this == *"\n" || this == *"[" || this == *"]" || | |||
this == *"<" || this == *">" || (this == *"'" && next == *"'") || | |||
(this == *"|" && ctx & LC_TEMPLATE) || | |||
(this == *"=" && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || | |||
(this == *"}" && next == *"}" && | |||
(ctx & LC_TEMPLATE || (after == *"}" && ctx & LC_ARGUMENT)))); | |||
} | |||
/* | |||
Really parse an external link. | |||
*/ | |||
static PyObject* | |||
@@ -1050,35 +1068,31 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||
while (1) { | |||
this = Tokenizer_READ(self, 0); | |||
next = Tokenizer_READ(self, 1); | |||
if (this == *"" || this == *"\n") { | |||
if (brackets) | |||
return Tokenizer_fail_route(self); | |||
self->head--; | |||
return Tokenizer_pop(self); | |||
} | |||
if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) { | |||
if (this == *"&") { | |||
PUSH_TAIL_BUFFER(*extra, NULL) | |||
if (Tokenizer_parse_template_or_argument(self)) | |||
if (Tokenizer_parse_entity(self)) | |||
return NULL; | |||
} | |||
else if (this == *"[") { | |||
if (!brackets) { | |||
self->head--; | |||
return Tokenizer_pop(self); | |||
} | |||
if (Tokenizer_emit_char(self, *"[")) | |||
else if (this == *"<" && next == *"!" | |||
&& Tokenizer_READ(self, 2) == *"-" | |||
&& Tokenizer_READ(self, 3) == *"-") { | |||
PUSH_TAIL_BUFFER(*extra, NULL) | |||
if (Tokenizer_parse_comment(self)) | |||
return NULL; | |||
} | |||
else if (this == *"]") { | |||
if (!brackets) | |||
self->head--; | |||
else if (!brackets && Tokenizer_is_free_link(self, this, next)) { | |||
self->head--; | |||
return Tokenizer_pop(self); | |||
} | |||
else if (this == *"&") { | |||
else if (this == *"" || this == *"\n") | |||
return Tokenizer_fail_route(self); | |||
else if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) { | |||
PUSH_TAIL_BUFFER(*extra, NULL) | |||
if (Tokenizer_parse_entity(self)) | |||
if (Tokenizer_parse_template_or_argument(self)) | |||
return NULL; | |||
} | |||
else if (this == *"]") | |||
return Tokenizer_pop(self); | |||
else if (this == *" ") { | |||
if (brackets) { | |||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | |||
@@ -261,6 +261,7 @@ static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); | |||
static void Tokenizer_dealloc(Tokenizer*); | |||
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); | |||
static int Tokenizer_parse_entity(Tokenizer*); | |||
static int Tokenizer_parse_comment(Tokenizer*); | |||
static int Tokenizer_handle_dl_term(Tokenizer*); | |||
static int Tokenizer_parse_tag(Tokenizer*); | |||
static PyObject* Tokenizer_parse(Tokenizer*, int, int); | |||
@@ -358,7 +358,7 @@ class Tokenizer(object): | |||
slashes = self._read() == self._read(1) == "/" | |||
if not is_scheme(scheme, slashes): | |||
raise BadRoute() | |||
self._push(contexts.EXT_LINK_URI) | |||
self._push(self._context | contexts.EXT_LINK_URI) | |||
self._emit_text(scheme) | |||
self._emit_text(":") | |||
if slashes: | |||
@@ -385,6 +385,18 @@ class Tokenizer(object): | |||
self._emit_text(this) | |||
return punct, tail | |||
def _is_free_link_end(self, this, next):
    """Return whether the current head is the end of a free link.

    *this* and *next* are the values at the head and one position past it;
    the value two positions past the head is read here as well. The result
    is truthy when one of the end conditions below matches in the current
    context, and falsy otherwise.
    """
    # Built from _parse()'s end sentinels:
    after, ctx = self._read(2), self._context
    equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING
    # End of input, a newline, or a bracket always ends the link:
    if this in (self.END, "\n", "[", "]", "<", ">"):
        return True
    # So does a doubled quote:
    if this == next == "'":
        return True
    # The remaining sentinels only count in the matching contexts; "}}" is
    # shared by the template ("}}") and argument ("}}}") checks.
    double_brace = this == next == "}"
    return ((this == "|" and ctx & contexts.TEMPLATE) or
            (this == "=" and ctx & equal_sign_contexts) or
            (double_brace and ctx & contexts.TEMPLATE) or
            (double_brace and after == "}" and ctx & contexts.ARGUMENT))
def _really_parse_external_link(self, brackets): | |||
"""Really parse an external link.""" | |||
if brackets: | |||
@@ -399,27 +411,28 @@ class Tokenizer(object): | |||
tail = "" | |||
while True: | |||
this, next = self._read(), self._read(1) | |||
if this is self.END or this == "\n": | |||
if brackets: | |||
self._fail_route() | |||
if this == "&": | |||
if tail: | |||
self._emit_text(tail) | |||
tail = "" | |||
self._parse_entity() | |||
elif (this == "<" and next == "!" and self._read(2) == | |||
self._read(3) == "-"): | |||
if tail: | |||
self._emit_text(tail) | |||
tail = "" | |||
self._parse_comment() | |||
elif not brackets and self._is_free_link_end(this, next): | |||
return self._pop(), tail, -1 | |||
elif this is self.END or this == "\n": | |||
self._fail_route() | |||
elif this == next == "{" and self._can_recurse(): | |||
if tail: | |||
self._emit_text(tail) | |||
tail = "" | |||
self._parse_template_or_argument() | |||
elif this == "[": | |||
if brackets: | |||
self._emit_text("[") | |||
else: | |||
return self._pop(), tail, -1 | |||
elif this == "]": | |||
return self._pop(), tail, 0 if brackets else -1 | |||
elif this == "&": | |||
if tail: | |||
self._emit_text(tail) | |||
tail = "" | |||
self._parse_entity() | |||
return self._pop(), tail, 0 | |||
elif " " in this: | |||
before, after = this.split(" ", 1) | |||
if brackets: | |||
@@ -409,7 +409,7 @@ class Wikicode(StringMixIn): | |||
Each section contains all of its subsections. If *levels* is given, it | |||
should be an iterable of integers; only sections whose heading levels | |||
are within it will be returned.If *matches* is given, it should be a | |||
are within it will be returned. If *matches* is given, it should be a | |||
regex to be matched against the titles of section headings; only | |||
sections whose headings match the regex will be included. *flags* can | |||
be used to override the default regex flags (see :py:meth:`ifilter`) if | |||
@@ -53,6 +53,7 @@ setup( | |||
"License :: OSI Approved :: MIT License", | |||
"Operating System :: OS Independent", | |||
"Programming Language :: Python :: 2.7", | |||
"Programming Language :: Python :: 3", | |||
"Programming Language :: Python :: 3.3", | |||
"Topic :: Text Processing :: Markup" | |||
], | |||
@@ -72,3 +72,81 @@ name: link_inside_dl_2 | |||
label: an external link inside a def list, such that the external link is not parsed | |||
input: ";;;malito:example" | |||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")] | |||
--- | |||
name: link_inside_template | |||
label: an external link nested inside a template, before the end | |||
input: "{{URL|http://example.com}}" | |||
output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateClose()] | |||
--- | |||
name: link_inside_template_2 | |||
label: an external link nested inside a template, before a separator | |||
input: "{{URL|http://example.com|foobar}}" | |||
output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateParamSeparator(), Text(text="foobar"), TemplateClose()] | |||
--- | |||
name: link_inside_template_3 | |||
label: an external link nested inside a template, before an equal sign | |||
input: "{{URL|http://example.com=foobar}}" | |||
output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateParamEquals(), Text(text="foobar"), TemplateClose()] | |||
--- | |||
name: link_inside_argument | |||
label: an external link nested inside an argument | |||
input: "{{{URL|http://example.com}}}" | |||
output: [ArgumentOpen(), Text(text="URL"), ArgumentSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), ArgumentClose()] | |||
--- | |||
name: link_inside_heading | |||
label: an external link nested inside a heading | |||
input: "==http://example.com==" | |||
output: [HeadingStart(level=2), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), HeadingEnd()] | |||
--- | |||
name: link_inside_tag_body | |||
label: an external link nested inside the body of a tag | |||
input: "<ref>http://example.com</ref>" | |||
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=""), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||
--- | |||
name: link_inside_tag_style | |||
label: an external link nested inside style tags | |||
input: "''http://example.com''" | |||
output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TagOpenClose(), Text(text="i"), TagCloseClose()] | |||
--- | |||
name: style_tag_inside_link | |||
label: style tags disrupting an external link | |||
input: "http://example.com/foo''bar''" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()] | |||
--- | |||
name: comment_inside_link | |||
label: an HTML comment inside an external link | |||
input: "http://example.com/foo<!--comment-->bar" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()] | |||
--- | |||
name: bracketed_link_inside_template | |||
label: a bracketed external link nested inside a template, before the end | |||
input: "{{URL|[http://example.com}}]" | |||
output: [Text(text="{{URL|"), ExternalLinkOpen(brackets=True), Text(text="http://example.com}}"), ExternalLinkClose()] | |||
--- | |||
name: comment_inside_bracketed_link | |||
label: an HTML comment inside a bracketed external link | |||
input: "[http://example.com/foo<!--comment-->bar]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()] |