Browse Source

Clean up external links parsing logic and fix integer conversion

tags/v0.6.3
Ben Kurtovic 2 years ago
parent
commit
911d7e5f88
6 changed files with 83 additions and 152 deletions
  1. +1
    -0
      .gitignore
  2. +2
    -0
      CHANGELOG
  3. +3
    -0
      docs/changelog.rst
  4. +30
    -98
      src/mwparserfromhell/parser/ctokenizer/tok_parse.c
  5. +3
    -3
      src/mwparserfromhell/parser/ctokenizer/tokenizer.c
  6. +44
    -51
      src/mwparserfromhell/parser/tokenizer.py

+ 1
- 0
.gitignore View File

@@ -14,3 +14,4 @@ docs/_build
scripts/*.log
htmlcov/
.idea/
.pytest_cache/

+ 2
- 0
CHANGELOG View File

@@ -1,6 +1,8 @@
v0.7 (unreleased):

- Added Linux AArch64 wheels. (#276)
- Fixed C integer conversion, manifesting as parsing errors on big-endian
platforms. (#277)

v0.6.2 (released May 16, 2021):



+ 3
- 0
docs/changelog.rst View File

@@ -9,6 +9,9 @@ Unreleased

- Added Linux AArch64 wheels.
(`#276 <https://github.com/earwig/mwparserfromhell/issues/276>`_)
- Fixed C integer conversion, manifesting as parsing errors on big-endian
platforms.
(`#277 <https://github.com/earwig/mwparserfromhell/issues/277>`_)

v0.6.2
------


+ 30
- 98
src/mwparserfromhell/parser/ctokenizer/tok_parse.c View File

@@ -1,5 +1,5 @@
/*
Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
Copyright (C) 2012-2021 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
@@ -100,66 +100,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
}

/*
Check if the given character is a non-word character.

Returns 1 if the character matches the regex \W (a non-word character),
0 if it is a word character, and -1 if a Python exception occurred
(the caller must check for and propagate the error).

Equivalent to this Python code:

def is_non_word_character(ch):
if re.fullmatch(r"\W", ch):
return True
return False
*/
static int is_non_word_character(Py_UCS4 ch)
{
int ret = 0;
PyObject* modname = NULL;
PyObject* module = NULL;
PyObject* fmatch = NULL;
PyObject* pattern = NULL;
PyObject* str = NULL;
PyObject* posArgs = NULL;
PyObject* match = NULL;

/* Import the standard 're' module by name and look up re.fullmatch. */
modname = PyUnicode_FromString("re");
if (modname == NULL)
goto error;
module = PyImport_Import(modname);
if (module == NULL)
goto error;
fmatch = PyObject_GetAttrString(module, "fullmatch");
if (fmatch == NULL)
goto error;
pattern = PyUnicode_FromString("\\W");
if (pattern == NULL)
goto error;
/* Build a one-character Python string from the code point under test. */
str = PyUnicode_FROM_SINGLE(ch);
if (str == NULL)
goto error;
posArgs = PyTuple_Pack(2, pattern, str);
if (posArgs == NULL)
goto error;
match = PyObject_Call(fmatch, posArgs, NULL);
if (match == NULL)
goto error;

/* re.fullmatch() returns None when the pattern does not match. */
if (match != Py_None)
ret = 1;
goto end;

error:
ret = -1;
end:
/* Release all temporaries; Py_XDECREF tolerates the NULLs left by any
early exit above. */
Py_XDECREF(match);
Py_XDECREF(posArgs);
Py_XDECREF(str);
Py_XDECREF(pattern);
Py_XDECREF(fmatch);
Py_XDECREF(module);
Py_XDECREF(modname);
return ret;
}

/*
Parse a template at the head of the wikicode string.
*/
static int Tokenizer_parse_template(Tokenizer* self, int has_content)
@@ -576,7 +516,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
static const char* valid = URISCHEME;
Textbuffer *scheme_buffer = Textbuffer_new(&self->text);
PyObject *scheme;
Py_UCS4 chunk;
Py_UCS4 ch;
Py_ssize_t i;
int slashes, j;
uint64_t new_context;
@@ -586,15 +526,10 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
// We have to backtrack through the textbuffer looking for our scheme since
// it was just parsed as text:
for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) {
chunk = Textbuffer_read(self->topstack->textbuffer, i);
// stop at the first non-word character
int is_non_word = is_non_word_character(chunk);
if (is_non_word < 0) {
Textbuffer_dealloc(scheme_buffer);
return -1;
}
else if (is_non_word == 1)
goto end_of_loop;
ch = Textbuffer_read(self->topstack->textbuffer, i);
// Stop at the first non-word character (equivalent to \W in regex)
if (!Py_UNICODE_ISALNUM(ch) && ch != '_')
break;
j = 0;
do {
if (!valid[j]) {
@@ -602,10 +537,9 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
FAIL_ROUTE(0);
return 0;
}
} while (chunk != (Py_UCS4) valid[j++]);
Textbuffer_write(scheme_buffer, chunk);
} while (ch != (Py_UCS4) valid[j++]);
Textbuffer_write(scheme_buffer, ch);
}
end_of_loop:
Textbuffer_reverse(scheme_buffer);
scheme = Textbuffer_render(scheme_buffer);
if (!scheme) {
@@ -670,17 +604,17 @@ static int Tokenizer_handle_free_link_text(
}

/*
Return whether the current head is the end of a free link.
Return whether the current head is the end of a URI.
*/
static int
Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
Tokenizer_is_uri_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
{
// Built from Tokenizer_parse()'s end sentinels:
Py_UCS4 after = Tokenizer_read(self, 2);
uint64_t ctx = self->topstack->context;

return (!this || this == '\n' || this == '[' || this == ']' ||
this == '<' || this == '>' || this == '"' ||
this == '<' || this == '>' || this == '"' || this == ' ' ||
(this == '\'' && next == '\'') ||
(this == '|' && ctx & LC_TEMPLATE) ||
(this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
@@ -723,50 +657,48 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
if (Tokenizer_parse_comment(self))
return NULL;
}
else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) {
self->head--;
return Tokenizer_pop(self);
}
else if (!this || this == '\n')
return Tokenizer_fail_route(self);
else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) {
PUSH_TAIL_BUFFER(extra, NULL)
if (Tokenizer_parse_template_or_argument(self))
return NULL;
}
else if (this == ']')
return Tokenizer_pop(self);
else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) {
if (brackets) {
else if (brackets) {
if (!this || this == '\n')
return Tokenizer_fail_route(self);
if (this == ']')
return Tokenizer_pop(self);
if (Tokenizer_is_uri_end(self, this, next)) {
if (this == ' ') {
if (Tokenizer_emit(self, ExternalLinkSeparator))
return NULL;
self->head++;
}
else {
PyObject* kwargs = PyDict_New();
if (!kwargs)
return NULL;
if (this != ' ')
PyDict_SetItemString(kwargs, "suppress_space", Py_True);
PyDict_SetItemString(kwargs, "suppress_space", Py_True);
if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs))
return NULL;
}
self->topstack->context ^= LC_EXT_LINK_URI;
self->topstack->context |= LC_EXT_LINK_TITLE;
if (this == ' ')
self->head++;
return Tokenizer_parse(self, 0, 0);
}
if (Textbuffer_write(extra, this))
return NULL;
return Tokenizer_pop(self);
}
else if (!brackets) {
if (Tokenizer_handle_free_link_text(self, &parens, extra, this))
if (Tokenizer_emit_char(self, this))
return NULL;
}
else {
if (Tokenizer_emit_char(self, this))
if (Tokenizer_is_uri_end(self, this, next)) {
if (this == ' ') {
if (Textbuffer_write(extra, this))
return NULL;
}
else
self->head--;
return Tokenizer_pop(self);
}
if (Tokenizer_handle_free_link_text(self, &parens, extra, this))
return NULL;
}
self->head++;


+ 3
- 3
src/mwparserfromhell/parser/ctokenizer/tokenizer.c View File

@@ -129,10 +129,10 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{
PyObject *input, *tokens;
uint64_t context = 0;
unsigned long long context = 0;
int skip_style_tags = 0;

if (PyArg_ParseTuple(args, "U|ii", &input, &context, &skip_style_tags)) {
if (PyArg_ParseTuple(args, "U|Kp", &input, &context, &skip_style_tags)) {
Py_INCREF(input);
if (load_tokenizer_text(&self->text, input))
return NULL;
@@ -143,7 +143,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)

/* Failed to parse a Unicode object; try a string instead. */
PyErr_Clear();
if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context,
if (!PyArg_ParseTuple(args, "s#|Kp", &encoded, &size, &context,
&skip_style_tags))
return NULL;
if (!(input = PyUnicode_FromStringAndSize(encoded, size)))


+ 44
- 51
src/mwparserfromhell/parser/tokenizer.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -60,8 +60,9 @@ class Tokenizer:
USES_C = False
START = object()
END = object()
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", '"', "#", "*", ";",
":", "/", "-", "!", "\n", START, END]
URISCHEME = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
MAX_DEPTH = 40
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
tag_splitter = re.compile(r"([\s\"\'\\]+)")
@@ -323,7 +324,7 @@ class Tokenizer:
self._head += 2
try:
# If the wikilink looks like an external link, parse it as such:
link, _extra, _delta = self._really_parse_external_link(True)
link, _extra = self._really_parse_external_link(True)
except BadRoute:
self._head = reset + 1
try:
@@ -366,8 +367,7 @@ class Tokenizer:
self._emit_text("//")
self._head += 2
else:
valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
all_valid = lambda: all(char in valid for char in self._read())
all_valid = lambda: all(char in self.URISCHEME for char in self._read())
scheme = ""
while self._read() is not self.END and all_valid():
scheme += self._read()
@@ -386,17 +386,16 @@ class Tokenizer:

def _parse_free_uri_scheme(self):
"""Parse the URI scheme of a free (no brackets) external link."""
valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
scheme = []
try:
# We have to backtrack through the textbuffer looking for our
# scheme since it was just parsed as text:
for chunk in reversed(self._textbuffer):
for char in reversed(chunk):
# stop at the first non-word character
# Stop at the first non-word character
if re.fullmatch(r"\W", char):
raise StopIteration()
if char not in valid:
if char not in self.URISCHEME:
raise BadRoute()
scheme.append(char)
except StopIteration:
@@ -434,15 +433,15 @@ class Tokenizer:
self._emit_text(this)
return punct, tail

def _is_free_link_end(self, this, nxt):
"""Return whether the current head is the end of a free link."""
def _is_uri_end(self, this, nxt):
"""Return whether the current head is the end of a URI."""
# Built from _parse()'s end sentinels:
after, ctx = self._read(2), self._context
equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING
return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or
return (this in (self.END, "\n", "[", "]", "<", ">", '"') or
" " in this or
this == nxt == "'" or
(this == "|" and ctx & contexts.TEMPLATE) or
(this == "=" and ctx & equal_sign_contexts) or
(this == "=" and ctx & (contexts.TEMPLATE_PARAM_KEY | contexts.HEADING)) or
(this == nxt == "}" and ctx & contexts.TEMPLATE) or
(this == nxt == after == "}" and ctx & contexts.ARGUMENT))

@@ -451,6 +450,7 @@ class Tokenizer:
if brackets:
self._parse_bracketed_uri_scheme()
invalid = ("\n", " ", "]")
punct = ()
else:
self._parse_free_uri_scheme()
invalid = ("\n", " ", "[", "]")
@@ -465,53 +465,47 @@ class Tokenizer:
self._emit_text(tail)
tail = ""
self._parse_entity()
elif (this == "<" and nxt == "!" and self._read(2) ==
self._read(3) == "-"):
elif this == "<" and nxt == "!" and self._read(2) == self._read(3) == "-":
if tail:
self._emit_text(tail)
tail = ""
self._parse_comment()
elif not brackets and self._is_free_link_end(this, nxt):
return self._pop(), tail, -1
elif this is self.END or this == "\n":
self._fail_route()
elif this == nxt == "{" and self._can_recurse():
if tail:
self._emit_text(tail)
tail = ""
self._parse_template_or_argument()
elif this == "]":
return self._pop(), tail, 0
elif this == "'" and nxt == "'":
separator = tokens.ExternalLinkSeparator()
separator.suppress_space = True
self._emit(separator)
self._context ^= contexts.EXT_LINK_URI
self._context |= contexts.EXT_LINK_TITLE
return self._parse(push=False), None, 0
elif any(ch in this for ch in (" ", "\n", "[", "]", "<", ">",
"\"")):
before, after = re.split(r"[ \n[\]<>\"]", this, maxsplit=1)
delimiter = this[len(before)]
if brackets:
self._emit_text(before)
separator = tokens.ExternalLinkSeparator()
if delimiter != " ":
elif brackets:
if this is self.END or this == "\n":
self._fail_route()
if this == "]":
return self._pop(), None
if self._is_uri_end(this, nxt):
if " " in this:
before, after = this.split(" ", 1)
self._emit_text(before)
self._emit(tokens.ExternalLinkSeparator())
if after:
self._emit_text(after)
self._head += 1
else:
separator = tokens.ExternalLinkSeparator()
separator.suppress_space = True
self._emit(separator)
if after:
self._emit_text(after)
self._emit(separator)
self._context ^= contexts.EXT_LINK_URI
self._context |= contexts.EXT_LINK_TITLE
if delimiter == " ":
self._head += 1
return self._parse(push=False), None, 0
punct, tail = self._handle_free_link_text(punct, tail, before)
return self._pop(), tail + " " + after, 0
elif not brackets:
punct, tail = self._handle_free_link_text(punct, tail, this)
else:
return self._parse(push=False), None
self._emit_text(this)
else:
if self._is_uri_end(this, nxt):
if this is not self.END and " " in this:
before, after = this.split(" ", 1)
punct, tail = self._handle_free_link_text(punct, tail, before)
tail += " " + after
else:
self._head -= 1
return self._pop(), tail
punct, tail = self._handle_free_link_text(punct, tail, this)
self._head += 1

def _remove_uri_scheme_from_textbuffer(self, scheme):
@@ -536,7 +530,7 @@ class Tokenizer:
reset = self._head
self._head += 1
try:
link, extra, delta = self._really_parse_external_link(brackets)
link, extra = self._really_parse_external_link(brackets)
except BadRoute:
self._head = reset
if not brackets and self._context & contexts.DL_TERM:
@@ -550,7 +544,6 @@ class Tokenizer:
self._emit(tokens.ExternalLinkOpen(brackets=brackets))
self._emit_all(link)
self._emit(tokens.ExternalLinkClose())
self._head += delta
if extra:
self._emit_text(extra)

@@ -854,8 +847,8 @@ class Tokenizer:
depth -= 1
if depth == 0: # pragma: no cover (untestable/exceptional)
raise ParserError(
"_handle_single_tag_end() got an unexpected "
"TagCloseSelfclose")
"_handle_single_tag_end() got an unexpected TagCloseSelfclose"
)
else: # pragma: no cover (untestable/exceptional case)
raise ParserError("_handle_single_tag_end() missed a TagCloseOpen")
padding = stack[index].padding


Loading…
Cancel
Save