@@ -1,3 +1,8 @@ | |||||
v0.3.1 (released August 29, 2013): | |||||
- Fixed a parser bug involving URLs nested inside other markup. | |||||
- Fixed some typos. | |||||
v0.3 (released August 24, 2013): | v0.3 (released August 24, 2013): | ||||
- Added complete support for HTML Tags, including forms like <ref>foo</ref>, | - Added complete support for HTML Tags, including forms like <ref>foo</ref>, | ||||
@@ -9,8 +9,8 @@ mwparserfromhell | |||||
that provides an easy-to-use and outrageously powerful parser for MediaWiki_ | that provides an easy-to-use and outrageously powerful parser for MediaWiki_ | ||||
wikicode. It supports Python 2 and Python 3. | wikicode. It supports Python 2 and Python 3. | ||||
Developed by Earwig_ with help from `Σ`_. Full documentation is available on | |||||
ReadTheDocs_. | |||||
Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others. | |||||
Full documentation is available on ReadTheDocs_. Development occurs on GitHub_. | |||||
Installation | Installation | ||||
------------ | ------------ | ||||
@@ -148,6 +148,8 @@ following code (via the API_):: | |||||
.. _ReadTheDocs: http://mwparserfromhell.readthedocs.org | .. _ReadTheDocs: http://mwparserfromhell.readthedocs.org | ||||
.. _Earwig: http://en.wikipedia.org/wiki/User:The_Earwig | .. _Earwig: http://en.wikipedia.org/wiki/User:The_Earwig | ||||
.. _Σ: http://en.wikipedia.org/wiki/User:%CE%A3 | .. _Σ: http://en.wikipedia.org/wiki/User:%CE%A3 | ||||
.. _Legoktm: http://en.wikipedia.org/wiki/User:Legoktm | |||||
.. _GitHub: https://github.com/earwig/mwparserfromhell | |||||
.. _Python Package Index: http://pypi.python.org | .. _Python Package Index: http://pypi.python.org | ||||
.. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat | .. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat | ||||
.. _get pip: http://pypi.python.org/pypi/pip | .. _get pip: http://pypi.python.org/pypi/pip | ||||
@@ -1,6 +1,15 @@ | |||||
Changelog | Changelog | ||||
========= | ========= | ||||
v0.3.1 | |||||
------ | |||||
`Released August 29, 2013 <https://github.com/earwig/mwparserfromhell/tree/v0.3.1>`_ | |||||
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.3...v0.3.1>`__): | |||||
- Fixed a parser bug involving URLs nested inside other markup. | |||||
- Fixed some typos. | |||||
v0.3 | v0.3 | ||||
---- | ---- | ||||
@@ -31,7 +31,7 @@ from __future__ import unicode_literals | |||||
__author__ = "Ben Kurtovic" | __author__ = "Ben Kurtovic" | ||||
__copyright__ = "Copyright (C) 2012, 2013 Ben Kurtovic" | __copyright__ = "Copyright (C) 2012, 2013 Ben Kurtovic" | ||||
__license__ = "MIT License" | __license__ = "MIT License" | ||||
__version__ = "0.3" | |||||
__version__ = "0.3.1" | |||||
__email__ = "ben.kurtovic@verizon.net" | __email__ = "ben.kurtovic@verizon.net" | ||||
from . import (compat, definitions, nodes, parser, smart_list, string_mixin, | from . import (compat, definitions, nodes, parser, smart_list, string_mixin, | ||||
@@ -982,7 +982,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||||
return 0; | return 0; | ||||
} | } | ||||
Py_DECREF(scheme); | Py_DECREF(scheme); | ||||
if (Tokenizer_push(self, LC_EXT_LINK_URI)) { | |||||
if (Tokenizer_push(self, self->topstack->context | LC_EXT_LINK_URI)) { | |||||
Textbuffer_dealloc(scheme_buffer); | Textbuffer_dealloc(scheme_buffer); | ||||
return -1; | return -1; | ||||
} | } | ||||
@@ -1028,6 +1028,24 @@ Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, | |||||
} | } | ||||
/* | /* | ||||
Return whether the current head is the end of a free link. | |||||
*/ | |||||
static int | |||||
Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next) | |||||
{ | |||||
// Built from Tokenizer_parse()'s end sentinels: | |||||
Py_UNICODE after = Tokenizer_READ(self, 2); | |||||
int ctx = self->topstack->context; | |||||
return (this == *"" || this == *"\n" || this == *"[" || this == *"]" || | |||||
this == *"<" || this == *">" || (this == *"'" && next == *"'") || | |||||
(this == *"|" && ctx & LC_TEMPLATE) || | |||||
(this == *"=" && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || | |||||
(this == *"}" && next == *"}" && | |||||
(ctx & LC_TEMPLATE || (after == *"}" && ctx & LC_ARGUMENT)))); | |||||
} | |||||
/* | |||||
Really parse an external link. | Really parse an external link. | ||||
*/ | */ | ||||
static PyObject* | static PyObject* | ||||
@@ -1050,35 +1068,31 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||||
while (1) { | while (1) { | ||||
this = Tokenizer_READ(self, 0); | this = Tokenizer_READ(self, 0); | ||||
next = Tokenizer_READ(self, 1); | next = Tokenizer_READ(self, 1); | ||||
if (this == *"" || this == *"\n") { | |||||
if (brackets) | |||||
return Tokenizer_fail_route(self); | |||||
self->head--; | |||||
return Tokenizer_pop(self); | |||||
} | |||||
if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) { | |||||
if (this == *"&") { | |||||
PUSH_TAIL_BUFFER(*extra, NULL) | PUSH_TAIL_BUFFER(*extra, NULL) | ||||
if (Tokenizer_parse_template_or_argument(self)) | |||||
if (Tokenizer_parse_entity(self)) | |||||
return NULL; | return NULL; | ||||
} | } | ||||
else if (this == *"[") { | |||||
if (!brackets) { | |||||
self->head--; | |||||
return Tokenizer_pop(self); | |||||
} | |||||
if (Tokenizer_emit_char(self, *"[")) | |||||
else if (this == *"<" && next == *"!" | |||||
&& Tokenizer_READ(self, 2) == *"-" | |||||
&& Tokenizer_READ(self, 3) == *"-") { | |||||
PUSH_TAIL_BUFFER(*extra, NULL) | |||||
if (Tokenizer_parse_comment(self)) | |||||
return NULL; | return NULL; | ||||
} | } | ||||
else if (this == *"]") { | |||||
if (!brackets) | |||||
self->head--; | |||||
else if (!brackets && Tokenizer_is_free_link(self, this, next)) { | |||||
self->head--; | |||||
return Tokenizer_pop(self); | return Tokenizer_pop(self); | ||||
} | } | ||||
else if (this == *"&") { | |||||
else if (this == *"" || this == *"\n") | |||||
return Tokenizer_fail_route(self); | |||||
else if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) { | |||||
PUSH_TAIL_BUFFER(*extra, NULL) | PUSH_TAIL_BUFFER(*extra, NULL) | ||||
if (Tokenizer_parse_entity(self)) | |||||
if (Tokenizer_parse_template_or_argument(self)) | |||||
return NULL; | return NULL; | ||||
} | } | ||||
else if (this == *"]") | |||||
return Tokenizer_pop(self); | |||||
else if (this == *" ") { | else if (this == *" ") { | ||||
if (brackets) { | if (brackets) { | ||||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | if (Tokenizer_emit(self, ExternalLinkSeparator)) | ||||
@@ -261,6 +261,7 @@ static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); | |||||
static void Tokenizer_dealloc(Tokenizer*); | static void Tokenizer_dealloc(Tokenizer*); | ||||
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); | static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); | ||||
static int Tokenizer_parse_entity(Tokenizer*); | static int Tokenizer_parse_entity(Tokenizer*); | ||||
static int Tokenizer_parse_comment(Tokenizer*); | |||||
static int Tokenizer_handle_dl_term(Tokenizer*); | static int Tokenizer_handle_dl_term(Tokenizer*); | ||||
static int Tokenizer_parse_tag(Tokenizer*); | static int Tokenizer_parse_tag(Tokenizer*); | ||||
static PyObject* Tokenizer_parse(Tokenizer*, int, int); | static PyObject* Tokenizer_parse(Tokenizer*, int, int); | ||||
@@ -358,7 +358,7 @@ class Tokenizer(object): | |||||
slashes = self._read() == self._read(1) == "/" | slashes = self._read() == self._read(1) == "/" | ||||
if not is_scheme(scheme, slashes): | if not is_scheme(scheme, slashes): | ||||
raise BadRoute() | raise BadRoute() | ||||
self._push(contexts.EXT_LINK_URI) | |||||
self._push(self._context | contexts.EXT_LINK_URI) | |||||
self._emit_text(scheme) | self._emit_text(scheme) | ||||
self._emit_text(":") | self._emit_text(":") | ||||
if slashes: | if slashes: | ||||
@@ -385,6 +385,18 @@ class Tokenizer(object): | |||||
self._emit_text(this) | self._emit_text(this) | ||||
return punct, tail | return punct, tail | ||||
def _is_free_link_end(self, this, next):
    """Return whether the current head is the end of a free link.

    *this* and *next* are the values read at the head and one position
    past it. The result is truthy when *this* (possibly together with the
    following characters and the current context) matches one of the end
    sentinels listed below, and falsy otherwise.
    """
    # The conditions below are built from _parse()'s end sentinels.
    third = self._read(2)
    context = self._context
    # Sentinels that end the link regardless of the current context:
    hard_stop = this in (self.END, "\n", "[", "]", "<", ">")
    # Two apostrophes in a row:
    double_apostrophe = this == next == "'"
    # A pipe, but only while a TEMPLATE bit is set in the context:
    template_pipe = this == "|" and context & contexts.TEMPLATE
    # An equal sign, but only in a template parameter key or a heading:
    key_or_heading = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING
    equal_sign = this == "=" and context & key_or_heading
    # "}}" inside a template, or "}}}" inside an argument:
    template_close = this == next == "}" and context & contexts.TEMPLATE
    argument_close = (this == next == third == "}" and
                      context & contexts.ARGUMENT)
    return (hard_stop or double_apostrophe or template_pipe or equal_sign or
            template_close or argument_close)
def _really_parse_external_link(self, brackets): | def _really_parse_external_link(self, brackets): | ||||
"""Really parse an external link.""" | """Really parse an external link.""" | ||||
if brackets: | if brackets: | ||||
@@ -399,27 +411,28 @@ class Tokenizer(object): | |||||
tail = "" | tail = "" | ||||
while True: | while True: | ||||
this, next = self._read(), self._read(1) | this, next = self._read(), self._read(1) | ||||
if this is self.END or this == "\n": | |||||
if brackets: | |||||
self._fail_route() | |||||
if this == "&": | |||||
if tail: | |||||
self._emit_text(tail) | |||||
tail = "" | |||||
self._parse_entity() | |||||
elif (this == "<" and next == "!" and self._read(2) == | |||||
self._read(3) == "-"): | |||||
if tail: | |||||
self._emit_text(tail) | |||||
tail = "" | |||||
self._parse_comment() | |||||
elif not brackets and self._is_free_link_end(this, next): | |||||
return self._pop(), tail, -1 | return self._pop(), tail, -1 | ||||
elif this is self.END or this == "\n": | |||||
self._fail_route() | |||||
elif this == next == "{" and self._can_recurse(): | elif this == next == "{" and self._can_recurse(): | ||||
if tail: | if tail: | ||||
self._emit_text(tail) | self._emit_text(tail) | ||||
tail = "" | tail = "" | ||||
self._parse_template_or_argument() | self._parse_template_or_argument() | ||||
elif this == "[": | |||||
if brackets: | |||||
self._emit_text("[") | |||||
else: | |||||
return self._pop(), tail, -1 | |||||
elif this == "]": | elif this == "]": | ||||
return self._pop(), tail, 0 if brackets else -1 | |||||
elif this == "&": | |||||
if tail: | |||||
self._emit_text(tail) | |||||
tail = "" | |||||
self._parse_entity() | |||||
return self._pop(), tail, 0 | |||||
elif " " in this: | elif " " in this: | ||||
before, after = this.split(" ", 1) | before, after = this.split(" ", 1) | ||||
if brackets: | if brackets: | ||||
@@ -409,7 +409,7 @@ class Wikicode(StringMixIn): | |||||
Each section contains all of its subsections. If *levels* is given, it | Each section contains all of its subsections. If *levels* is given, it | ||||
should be an iterable of integers; only sections whose heading levels | should be an iterable of integers; only sections whose heading levels | ||||
are within it will be returned.If *matches* is given, it should be a | |||||
are within it will be returned. If *matches* is given, it should be a | |||||
regex to be matched against the titles of section headings; only | regex to be matched against the titles of section headings; only | ||||
sections whose headings match the regex will be included. *flags* can | sections whose headings match the regex will be included. *flags* can | ||||
be used to override the default regex flags (see :py:meth:`ifilter`) if | be used to override the default regex flags (see :py:meth:`ifilter`) if | ||||
@@ -53,6 +53,7 @@ setup( | |||||
"License :: OSI Approved :: MIT License", | "License :: OSI Approved :: MIT License", | ||||
"Operating System :: OS Independent", | "Operating System :: OS Independent", | ||||
"Programming Language :: Python :: 2.7", | "Programming Language :: Python :: 2.7", | ||||
"Programming Language :: Python :: 3", | |||||
"Programming Language :: Python :: 3.3", | "Programming Language :: Python :: 3.3", | ||||
"Topic :: Text Processing :: Markup" | "Topic :: Text Processing :: Markup" | ||||
], | ], | ||||
@@ -72,3 +72,81 @@ name: link_inside_dl_2 | |||||
label: an external link inside a def list, such that the external link is not parsed | label: an external link inside a def list, such that the external link is not parsed | ||||
input: ";;;malito:example" | input: ";;;malito:example" | ||||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")] | output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")] | ||||
--- | |||||
name: link_inside_template | |||||
label: an external link nested inside a template, before the end | |||||
input: "{{URL|http://example.com}}" | |||||
output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateClose()] | |||||
--- | |||||
name: link_inside_template_2 | |||||
label: an external link nested inside a template, before a separator | |||||
input: "{{URL|http://example.com|foobar}}" | |||||
output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateParamSeparator(), Text(text="foobar"), TemplateClose()] | |||||
--- | |||||
name: link_inside_template_3 | |||||
label: an external link nested inside a template, before an equal sign | |||||
input: "{{URL|http://example.com=foobar}}" | |||||
output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateParamEquals(), Text(text="foobar"), TemplateClose()] | |||||
--- | |||||
name: link_inside_argument | |||||
label: an external link nested inside an argument | |||||
input: "{{{URL|http://example.com}}}" | |||||
output: [ArgumentOpen(), Text(text="URL"), ArgumentSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), ArgumentClose()] | |||||
--- | |||||
name: link_inside_heading | |||||
label: an external link nested inside a heading | |||||
input: "==http://example.com==" | |||||
output: [HeadingStart(level=2), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), HeadingEnd()] | |||||
--- | |||||
name: link_inside_tag_body | |||||
label: an external link nested inside the body of a tag | |||||
input: "<ref>http://example.com</ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=""), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: link_inside_tag_style | |||||
label: an external link nested inside style tags | |||||
input: "''http://example.com''" | |||||
output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TagOpenClose(), Text(text="i"), TagCloseClose()] | |||||
--- | |||||
name: style_tag_inside_link | |||||
label: style tags disrupting an external link | |||||
input: "http://example.com/foo''bar''" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()] | |||||
--- | |||||
name: comment_inside_link | |||||
label: an HTML comment inside an external link | |||||
input: "http://example.com/foo<!--comment-->bar" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()] | |||||
--- | |||||
name: bracketed_link_inside_template | |||||
label: a bracketed external link nested inside a template, before the end | |||||
input: "{{URL|[http://example.com}}]" | |||||
output: [Text(text="{{URL|"), ExternalLinkOpen(brackets=True), Text(text="http://example.com}}"), ExternalLinkClose()] | |||||
--- | |||||
name: comment_inside_bracketed_link | |||||
label: an HTML comment inside a bracketed external link | |||||
input: "[http://example.com/foo<!--comment-->bar]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()] |