浏览代码

Some fixes for the parsing of external links (#232)

* Proposed fix for https://github.com/earwig/mwparserfromhell/issues/197

* Port the fix for #197 to the C tokenizer

* Fix parsing of external links where the URL is terminated by some special character

- One existing test case has been found wrong -- current MediaWiki
  version always terminates the URL when an opening bracket is
  encountered.
- Other test cases added: double quote, two single quotes and angles
  always terminate the URL (regardless if it is a free link or external
  link inside brackets). One single quote does not terminate the URL.

* Fix case-insensitive parsing of URI schemes
tags/v0.6.1
Jakub Klinkovský 3 年前
committed by GitHub
父节点
当前提交
bb51e8f282
找不到此签名对应的密钥 GPG 密钥 ID: 4AEE18F83AFDEB23
共有 5 个文件被更改,包括 221 次插入24 次删除
  1. +4
    -1
      mwparserfromhell/nodes/external_link.py
  2. +6
    -3
      mwparserfromhell/parser/builder.py
  3. +89
    -10
      mwparserfromhell/parser/ctokenizer/tok_parse.c
  4. +22
    -8
      mwparserfromhell/parser/tokenizer.py
  5. +100
    -2
      tests/tokenizer/external_links.mwtest

+ 4
- 1
mwparserfromhell/nodes/external_link.py 查看文件

@@ -27,15 +27,18 @@ __all__ = ["ExternalLink"]
class ExternalLink(Node):
"""Represents an external link, like ``[http://example.com/ Example]``."""

def __init__(self, url, title=None, brackets=True):
def __init__(self, url, title=None, brackets=True, suppress_space=False):
super().__init__()
self.url = url
self.title = title
self.brackets = brackets
self.suppress_space = suppress_space

def __str__(self):
if self.brackets:
if self.title is not None:
if self.suppress_space is True:
return "[" + str(self.url) + str(self.title) + "]"
return "[" + str(self.url) + " " + str(self.title) + "]"
return "[" + str(self.url) + "]"
return str(self.url)


+ 6
- 3
mwparserfromhell/parser/builder.py 查看文件

@@ -157,17 +157,20 @@ class Builder:
@_add_handler(tokens.ExternalLinkOpen)
def _handle_external_link(self, token):
"""Handle when an external link is at the head of the tokens."""
brackets, url = token.brackets, None
brackets, url, suppress_space = token.brackets, None, None
self._push()
while self._tokens:
token = self._tokens.pop()
if isinstance(token, tokens.ExternalLinkSeparator):
url = self._pop()
suppress_space = token.suppress_space
self._push()
elif isinstance(token, tokens.ExternalLinkClose):
if url is not None:
return ExternalLink(url, self._pop(), brackets)
return ExternalLink(self._pop(), brackets=brackets)
return ExternalLink(url, self._pop(), brackets=brackets,
suppress_space=suppress_space is True)
return ExternalLink(self._pop(), brackets=brackets,
suppress_space=suppress_space is True)
else:
self._write(self._handle_token(token))
raise ParserError("_handle_external_link() missed a close token")


+ 89
- 10
mwparserfromhell/parser/ctokenizer/tok_parse.c 查看文件

@@ -30,7 +30,7 @@ SOFTWARE.
#define DIGITS "0123456789"
#define HEXDIGITS "0123456789abcdefABCDEF"
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
#define URISCHEME "abcdefghijklmnopqrstuvwxyz0123456789+.-"
#define URISCHEME "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"

#define MAX_BRACES 255
#define MAX_ENTITY_SIZE 8
@@ -100,6 +100,66 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
}

/*
Check if the given character is a non-word character.

Equivalent to this Python code:

def is_non_word_character(ch):
if re.fullmatch(r"\W", chunk):
return True
return False
*/
static int is_non_word_character(Py_UCS4 ch)
{
int ret = 0;
PyObject* modname = NULL;
PyObject* module = NULL;
PyObject* fmatch = NULL;
PyObject* pattern = NULL;
PyObject* str = NULL;
PyObject* posArgs = NULL;
PyObject* match = NULL;

modname = PyUnicode_FromString("re");
if (modname == NULL)
goto error;
module = PyImport_Import(modname);
if (module == NULL)
goto error;
fmatch = PyObject_GetAttrString(module, "fullmatch");
if (fmatch == NULL)
goto error;
pattern = PyUnicode_FromString("\\W");
if (pattern == NULL)
goto error;
str = PyUnicode_FROM_SINGLE(ch);
if (str == NULL)
goto error;
posArgs = PyTuple_Pack(2, pattern, str);
if (posArgs == NULL)
goto error;
match = PyObject_Call(fmatch, posArgs, NULL);
if (match == NULL)
goto error;

if (match != Py_None)
ret = 1;
goto end;

error:
ret = -1;
end:
Py_XDECREF(match);
Py_XDECREF(posArgs);
Py_XDECREF(str);
Py_XDECREF(pattern);
Py_XDECREF(fmatch);
Py_XDECREF(module);
Py_XDECREF(modname);
return ret;
}

/*
Parse a template at the head of the wikicode string.
*/
static int Tokenizer_parse_template(Tokenizer* self, int has_content)
@@ -527,7 +587,13 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
// it was just parsed as text:
for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) {
chunk = Textbuffer_read(self->topstack->textbuffer, i);
if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
// stop at the first non-word character
int is_non_word = is_non_word_character(chunk);
if (is_non_word < 0) {
Textbuffer_dealloc(scheme_buffer);
return -1;
}
else if (is_non_word == 1)
goto end_of_loop;
j = 0;
do {
@@ -607,14 +673,15 @@ static int Tokenizer_handle_free_link_text(
Return whether the current head is the end of a free link.
*/
static int
Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
{
// Built from Tokenizer_parse()'s end sentinels:
Py_UCS4 after = Tokenizer_read(self, 2);
uint64_t ctx = self->topstack->context;

return (!this || this == '\n' || this == '[' || this == ']' ||
this == '<' || this == '>' || (this == '\'' && next == '\'') ||
this == '<' || this == '>' || this == '"' ||
(this == '\'' && next == '\'') ||
(this == '|' && ctx & LC_TEMPLATE) ||
(this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
(this == '}' && next == '}' &&
@@ -656,7 +723,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
if (Tokenizer_parse_comment(self))
return NULL;
}
else if (!brackets && Tokenizer_is_free_link(self, this, next)) {
else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) {
self->head--;
return Tokenizer_pop(self);
}
@@ -669,16 +736,28 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
}
else if (this == ']')
return Tokenizer_pop(self);
else if (this == ' ') {
else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) {
if (brackets) {
if (Tokenizer_emit(self, ExternalLinkSeparator))
return NULL;
if (this == ' ') {
if (Tokenizer_emit(self, ExternalLinkSeparator))
return NULL;
}
else {
PyObject* kwargs = PyDict_New();
if (!kwargs)
return NULL;
if (this != ' ')
PyDict_SetItemString(kwargs, "suppress_space", Py_True);
if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs))
return NULL;
}
self->topstack->context ^= LC_EXT_LINK_URI;
self->topstack->context |= LC_EXT_LINK_TITLE;
self->head++;
if (this == ' ')
self->head++;
return Tokenizer_parse(self, 0, 0);
}
if (Textbuffer_write(extra, ' '))
if (Textbuffer_write(extra, this))
return NULL;
return Tokenizer_pop(self);
}


+ 22
- 8
mwparserfromhell/parser/tokenizer.py 查看文件

@@ -366,7 +366,7 @@ class Tokenizer:
self._emit_text("//")
self._head += 2
else:
valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
all_valid = lambda: all(char in valid for char in self._read())
scheme = ""
while self._read() is not self.END and all_valid():
@@ -386,14 +386,15 @@ class Tokenizer:

def _parse_free_uri_scheme(self):
"""Parse the URI scheme of a free (no brackets) external link."""
valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
scheme = []
try:
# We have to backtrack through the textbuffer looking for our
# scheme since it was just parsed as text:
for chunk in reversed(self._textbuffer):
for char in reversed(chunk):
if char.isspace() or char in self.MARKERS:
# stop at the first non-word character
if re.fullmatch(r"\W", char):
raise StopIteration()
if char not in valid:
raise BadRoute()
@@ -438,7 +439,7 @@ class Tokenizer:
# Built from _parse()'s end sentinels:
after, ctx = self._read(2), self._context
equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING
return (this in (self.END, "\n", "[", "]", "<", ">") or
return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or
this == nxt == "'" or
(this == "|" and ctx & contexts.TEMPLATE) or
(this == "=" and ctx & equal_sign_contexts) or
@@ -481,16 +482,29 @@ class Tokenizer:
self._parse_template_or_argument()
elif this == "]":
return self._pop(), tail, 0
elif " " in this:
before, after = this.split(" ", 1)
elif this == "'" and nxt == "'":
separator = tokens.ExternalLinkSeparator()
separator.suppress_space = True
self._emit(separator)
self._context ^= contexts.EXT_LINK_URI
self._context |= contexts.EXT_LINK_TITLE
return self._parse(push=False), None, 0
elif any(ch in this for ch in (" ", "\n", "[", "]", "<", ">",
"\"")):
before, after = re.split("[ \n\[\]<>\"]", this, maxsplit=1)
delimiter = this[len(before)]
if brackets:
self._emit_text(before)
self._emit(tokens.ExternalLinkSeparator())
separator = tokens.ExternalLinkSeparator()
if delimiter != " ":
separator.suppress_space = True
self._emit(separator)
if after:
self._emit_text(after)
self._context ^= contexts.EXT_LINK_URI
self._context |= contexts.EXT_LINK_TITLE
self._head += 1
if delimiter == " ":
self._head += 1
return self._parse(push=False), None, 0
punct, tail = self._handle_free_link_text(punct, tail, before)
return self._pop(), tail + " " + after, 0


+ 100
- 2
tests/tokenizer/external_links.mwtest 查看文件

@@ -153,9 +153,9 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), Ext
---

name: brackets_open_bracket_inside
label: an open bracket inside a bracket-enclosed link that is also included
label: an open bracket inside a bracket-enclosed link that is not included
input: "[http://foobar[baz.com Example]"
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(suppress_space=True), Text(text="[baz.com Example"), ExternalLinkClose()]

---

@@ -478,3 +478,101 @@ name: brackets_scheme_title_but_no_url
label: brackets around a scheme, colon, and slashes, with a title
input: "[http:// Example]"
output: [Text(text="[http:// Example]")]

---

name: url_preceded_by_non_word_character
label: non-word character immediately before a valid URL
input: "svn+ssh://server.domain.com:/reponame"
output: [Text(text="svn+"), ExternalLinkOpen(brackets=False), Text(text="ssh://server.domain.com:/reponame"), ExternalLinkClose()]

---

name: url_preceded_by_underscore
label: underscore immediately before a valid URL
input: "svn_ssh://server.domain.com:/reponame"
output: [Text(text="svn_ssh://server.domain.com:/reponame")]

---

name: url_terminated_by_double_quote
label: a free link terminated by a double quote
input: "http://foo\"bar"
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="\"bar")]

---

name: url_not_terminated_by_single_quote
label: a free link not terminated by a single quote
input: "http://foo'bar"
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo'bar"), ExternalLinkClose()]

---

name: url_terminated_by_two_single_quotes
label: a free link terminated by two single quotes
input: "http://foo''bar''"
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: url_terminated_by_left_angle
label: a free link terminated by a left angle
input: "http://foo<bar"
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="<bar")]

---

name: url_terminated_by_right_angle
label: a free link terminated by a right angle
input: "http://foo>bar"
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text=">bar")]

---

name: brackets_terminated_by_double_quote
label: an external link terminated by a double quote
input: "[http://foo\"bar]"
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="\"bar"), ExternalLinkClose()]

---

name: brackets_not_terminated_by_single_quote
label: an external link not terminated by a single quote
input: "[http://foo'bar]"
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo'bar"), ExternalLinkClose()]

---

name: brackets_terminated_by_two_single_quotes
label: an external link terminated by two single quotes
input: "[http://foo''bar'']"
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), ExternalLinkClose()]

---

name: brackets_terminated_by_left_angle
label: an external link terminated by a left angle
input: "[http://foo<bar]"
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="<bar"), ExternalLinkClose()]

---

name: brackets_terminated_by_right_angle
label: an external link terminated by a right angle
input: "[http://foo>bar]"
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text=">bar"), ExternalLinkClose()]

---

name: scheme_case
label: a free link with uppercase letters in the URL scheme
input: "HtTp://example.com/"
output: [ExternalLinkOpen(brackets=False), Text(text="HtTp://example.com/"), ExternalLinkClose()]

---

name: bracket_scheme_case
label: an external link with uppercase letters in the URL scheme
input: "[HtTp://example.com/]"
output: [ExternalLinkOpen(brackets=True), Text(text="HtTp://example.com/"), ExternalLinkClose()]

正在加载...
取消
保存