Browse Source

Merge branch 'develop'

tags/v0.4
Ben Kurtovic 10 years ago
parent
commit
255dfd4a82
10 changed files with 162 additions and 39 deletions
  1. +5
    -0
      CHANGELOG
  2. +4
    -2
      README.rst
  3. +9
    -0
      docs/changelog.rst
  4. +1
    -1
      mwparserfromhell/__init__.py
  5. +34
    -20
      mwparserfromhell/parser/tokenizer.c
  6. +1
    -0
      mwparserfromhell/parser/tokenizer.h
  7. +28
    -15
      mwparserfromhell/parser/tokenizer.py
  8. +1
    -1
      mwparserfromhell/wikicode.py
  9. +1
    -0
      setup.py
  10. +78
    -0
      tests/tokenizer/integration.mwtest

+ 5
- 0
CHANGELOG View File

@@ -1,3 +1,8 @@
v0.3.1 (released August 29, 2013):

- Fixed a parser bug involving URLs nested inside other markup.
- Fixed some typos.

v0.3 (released August 24, 2013):

- Added complete support for HTML Tags, including forms like <ref>foo</ref>,


+ 4
- 2
README.rst View File

@@ -9,8 +9,8 @@ mwparserfromhell
that provides an easy-to-use and outrageously powerful parser for MediaWiki_
wikicode. It supports Python 2 and Python 3.

Developed by Earwig_ with help from `Σ`_. Full documentation is available on
ReadTheDocs_.
Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others.
Full documentation is available on ReadTheDocs_. Development occurs on GitHub_.

Installation
------------
@@ -148,6 +148,8 @@ following code (via the API_)::
.. _ReadTheDocs: http://mwparserfromhell.readthedocs.org
.. _Earwig: http://en.wikipedia.org/wiki/User:The_Earwig
.. _Σ: http://en.wikipedia.org/wiki/User:%CE%A3
.. _Legoktm: http://en.wikipedia.org/wiki/User:Legoktm
.. _GitHub: https://github.com/earwig/mwparserfromhell
.. _Python Package Index: http://pypi.python.org
.. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat
.. _get pip: http://pypi.python.org/pypi/pip


+ 9
- 0
docs/changelog.rst View File

@@ -1,6 +1,15 @@
Changelog
=========

v0.3.1
------

`Released August 29, 2013 <https://github.com/earwig/mwparserfromhell/tree/v0.3.1>`_
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.3...v0.3.1>`__):

- Fixed a parser bug involving URLs nested inside other markup.
- Fixed some typos.

v0.3
----



+ 1
- 1
mwparserfromhell/__init__.py View File

@@ -31,7 +31,7 @@ from __future__ import unicode_literals
__author__ = "Ben Kurtovic"
__copyright__ = "Copyright (C) 2012, 2013 Ben Kurtovic"
__license__ = "MIT License"
__version__ = "0.3"
__version__ = "0.3.1"
__email__ = "ben.kurtovic@verizon.net"

from . import (compat, definitions, nodes, parser, smart_list, string_mixin,


+ 34
- 20
mwparserfromhell/parser/tokenizer.c View File

@@ -982,7 +982,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
return 0;
}
Py_DECREF(scheme);
if (Tokenizer_push(self, LC_EXT_LINK_URI)) {
if (Tokenizer_push(self, self->topstack->context | LC_EXT_LINK_URI)) {
Textbuffer_dealloc(scheme_buffer);
return -1;
}
@@ -1028,6 +1028,24 @@ Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
}

/*
    Decide whether the character at the current head terminates a free
    external link. Evaluates to 1 when it does and to 0 otherwise.

    `this` and `next` are the characters at the head and one past it; the
    character two past the head is read here because the argument check needs
    to see a full "}}}".
*/
static int
Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
{
    // The checks below mirror the end sentinels of Tokenizer_parse():
    Py_UNICODE after = Tokenizer_READ(self, 2);
    int context = self->topstack->context;

    // End of input and line breaks always end the link.
    if (this == *"" || this == *"\n")
        return 1;
    // So do square and angle brackets, whatever the context.
    if (this == *"[" || this == *"]" || this == *"<" || this == *">")
        return 1;
    // A single quote only counts when it is doubled ('').
    if (this == *"'")
        return next == *"'";
    // A pipe ends the link only while inside a template.
    if (this == *"|")
        return (context & LC_TEMPLATE) != 0;
    // An equal sign ends it inside a template parameter key or a heading.
    if (this == *"=")
        return (context & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) != 0;
    // "}}" closes a template; "}}}" closes an argument.
    if (this == *"}" && next == *"}") {
        if (context & LC_TEMPLATE)
            return 1;
        return after == *"}" && (context & LC_ARGUMENT) != 0;
    }
    return 0;
}

/*
Really parse an external link.
*/
static PyObject*
@@ -1050,35 +1068,31 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
while (1) {
this = Tokenizer_READ(self, 0);
next = Tokenizer_READ(self, 1);
if (this == *"" || this == *"\n") {
if (brackets)
return Tokenizer_fail_route(self);
self->head--;
return Tokenizer_pop(self);
}
if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) {
if (this == *"&") {
PUSH_TAIL_BUFFER(*extra, NULL)
if (Tokenizer_parse_template_or_argument(self))
if (Tokenizer_parse_entity(self))
return NULL;
}
else if (this == *"[") {
if (!brackets) {
self->head--;
return Tokenizer_pop(self);
}
if (Tokenizer_emit_char(self, *"["))
else if (this == *"<" && next == *"!"
&& Tokenizer_READ(self, 2) == *"-"
&& Tokenizer_READ(self, 3) == *"-") {
PUSH_TAIL_BUFFER(*extra, NULL)
if (Tokenizer_parse_comment(self))
return NULL;
}
else if (this == *"]") {
if (!brackets)
self->head--;
else if (!brackets && Tokenizer_is_free_link(self, this, next)) {
self->head--;
return Tokenizer_pop(self);
}
else if (this == *"&") {
else if (this == *"" || this == *"\n")
return Tokenizer_fail_route(self);
else if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) {
PUSH_TAIL_BUFFER(*extra, NULL)
if (Tokenizer_parse_entity(self))
if (Tokenizer_parse_template_or_argument(self))
return NULL;
}
else if (this == *"]")
return Tokenizer_pop(self);
else if (this == *" ") {
if (brackets) {
if (Tokenizer_emit(self, ExternalLinkSeparator))


+ 1
- 0
mwparserfromhell/parser/tokenizer.h View File

@@ -261,6 +261,7 @@ static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
static void Tokenizer_dealloc(Tokenizer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
static int Tokenizer_parse_entity(Tokenizer*);
static int Tokenizer_parse_comment(Tokenizer*);
static int Tokenizer_handle_dl_term(Tokenizer*);
static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, int, int);


+ 28
- 15
mwparserfromhell/parser/tokenizer.py View File

@@ -358,7 +358,7 @@ class Tokenizer(object):
slashes = self._read() == self._read(1) == "/"
if not is_scheme(scheme, slashes):
raise BadRoute()
self._push(contexts.EXT_LINK_URI)
self._push(self._context | contexts.EXT_LINK_URI)
self._emit_text(scheme)
self._emit_text(":")
if slashes:
@@ -385,6 +385,18 @@ class Tokenizer(object):
self._emit_text(this)
return punct, tail

def _is_free_link_end(self, this, next):
    """Tell whether the head sits on a character that ends a free link.

    *this* and *next* are the values at the head and one position past it;
    the value two positions past the head is read here as well. The result
    is meant to be used for its truthiness: the context tests are bitwise
    ANDs, so a truthy result is not necessarily the ``True`` object.
    """
    # The conditions below mirror the end sentinels used by _parse():
    after = self._read(2)
    ctx = self._context
    equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING

    # Characters that end the link regardless of context.
    hard_stop = this in (self.END, "\n", "[", "]", "<", ">")
    # Two consecutive single quotes.
    doubled_quote = this == next == "'"
    # A pipe, but only while inside a template.
    template_pipe = this == "|" and ctx & contexts.TEMPLATE
    # An equal sign inside a template parameter key or a heading.
    key_or_heading_equals = this == "=" and ctx & equal_sign_contexts
    # "}}" while inside a template.
    template_close = this == next == "}" and ctx & contexts.TEMPLATE
    # "}}}" while inside an argument.
    argument_close = (this == next == after == "}" and
                      ctx & contexts.ARGUMENT)

    return (hard_stop or doubled_quote or template_pipe or
            key_or_heading_equals or template_close or argument_close)

def _really_parse_external_link(self, brackets):
"""Really parse an external link."""
if brackets:
@@ -399,27 +411,28 @@ class Tokenizer(object):
tail = ""
while True:
this, next = self._read(), self._read(1)
if this is self.END or this == "\n":
if brackets:
self._fail_route()
if this == "&":
if tail:
self._emit_text(tail)
tail = ""
self._parse_entity()
elif (this == "<" and next == "!" and self._read(2) ==
self._read(3) == "-"):
if tail:
self._emit_text(tail)
tail = ""
self._parse_comment()
elif not brackets and self._is_free_link_end(this, next):
return self._pop(), tail, -1
elif this is self.END or this == "\n":
self._fail_route()
elif this == next == "{" and self._can_recurse():
if tail:
self._emit_text(tail)
tail = ""
self._parse_template_or_argument()
elif this == "[":
if brackets:
self._emit_text("[")
else:
return self._pop(), tail, -1
elif this == "]":
return self._pop(), tail, 0 if brackets else -1
elif this == "&":
if tail:
self._emit_text(tail)
tail = ""
self._parse_entity()
return self._pop(), tail, 0
elif " " in this:
before, after = this.split(" ", 1)
if brackets:


+ 1
- 1
mwparserfromhell/wikicode.py View File

@@ -409,7 +409,7 @@ class Wikicode(StringMixIn):

Each section contains all of its subsections. If *levels* is given, it
should be an iterable of integers; only sections whose heading levels
are within it will be returned.If *matches* is given, it should be a
are within it will be returned. If *matches* is given, it should be a
regex to be matched against the titles of section headings; only
sections whose headings match the regex will be included. *flags* can
be used to override the default regex flags (see :py:meth:`ifilter`) if


+ 1
- 0
setup.py View File

@@ -53,6 +53,7 @@ setup(
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.3",
"Topic :: Text Processing :: Markup"
],


+ 78
- 0
tests/tokenizer/integration.mwtest View File

@@ -72,3 +72,81 @@ name: link_inside_dl_2
label: an external link inside a def list, such that the external link is not parsed
input: ";;;malito:example"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")]

---

name: link_inside_template
label: an external link nested inside a template, before the end
input: "{{URL|http://example.com}}"
output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateClose()]

---

name: link_inside_template_2
label: an external link nested inside a template, before a separator
input: "{{URL|http://example.com|foobar}}"
output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateParamSeparator(), Text(text="foobar"), TemplateClose()]

---

name: link_inside_template_3
label: an external link nested inside a template, before an equal sign
input: "{{URL|http://example.com=foobar}}"
output: [TemplateOpen(), Text(text="URL"), TemplateParamSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TemplateParamEquals(), Text(text="foobar"), TemplateClose()]

---

name: link_inside_argument
label: an external link nested inside an argument
input: "{{{URL|http://example.com}}}"
output: [ArgumentOpen(), Text(text="URL"), ArgumentSeparator(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), ArgumentClose()]

---

name: link_inside_heading
label: an external link nested inside a heading
input: "==http://example.com=="
output: [HeadingStart(level=2), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), HeadingEnd()]

---

name: link_inside_tag_body
label: an external link nested inside the body of a tag
input: "<ref>http://example.com</ref>"
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=""), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: link_inside_tag_style
label: an external link nested inside style tags
input: "''http://example.com''"
output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: style_tag_inside_link
label: style tags disrupting an external link
input: "http://example.com/foo''bar''"
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: comment_inside_link
label: an HTML comment inside an external link
input: "http://example.com/foo<!--comment-->bar"
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()]

---

name: bracketed_link_inside_template
label: a bracketed external link nested inside a template, before the end
input: "{{URL|[http://example.com}}]"
output: [Text(text="{{URL|"), ExternalLinkOpen(brackets=True), Text(text="http://example.com}}"), ExternalLinkClose()]


---

name: comment_inside_bracketed_link
label: an HTML comment inside a bracketed external link
input: "[http://example.com/foo<!--comment-->bar]"
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()]

Loading…
Cancel
Save