@@ -14,3 +14,4 @@ docs/_build
 scripts/*.log
 htmlcov/
 .idea/
+.pytest_cache/
@@ -1,6 +1,8 @@
 v0.7 (unreleased):
 
 - Added Linux AArch64 wheels. (#276)
+- Fixed C integer conversion, manifesting as parsing errors on big-endian
+  platforms. (#277)
 
 v0.6.2 (released May 16, 2021):
@@ -9,6 +9,9 @@ Unreleased
 
 - Added Linux AArch64 wheels.
   (`#276 <https://github.com/earwig/mwparserfromhell/issues/276>`_)
+- Fixed C integer conversion, manifesting as parsing errors on big-endian
+  platforms.
+  (`#277 <https://github.com/earwig/mwparserfromhell/issues/277>`_)
 
 v0.6.2
 ------
@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -100,66 +100,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
 }
 
-/*
-    Check if the given character is a non-word character.
-
-    Equivalent to this Python code:
-
-    def is_non_word_character(ch):
-        if re.fullmatch(r"\W", chunk):
-            return True
-        return False
-*/
-static int is_non_word_character(Py_UCS4 ch)
-{
-    int ret = 0;
-    PyObject* modname = NULL;
-    PyObject* module = NULL;
-    PyObject* fmatch = NULL;
-    PyObject* pattern = NULL;
-    PyObject* str = NULL;
-    PyObject* posArgs = NULL;
-    PyObject* match = NULL;
-
-    modname = PyUnicode_FromString("re");
-    if (modname == NULL)
-        goto error;
-    module = PyImport_Import(modname);
-    if (module == NULL)
-        goto error;
-    fmatch = PyObject_GetAttrString(module, "fullmatch");
-    if (fmatch == NULL)
-        goto error;
-    pattern = PyUnicode_FromString("\\W");
-    if (pattern == NULL)
-        goto error;
-    str = PyUnicode_FROM_SINGLE(ch);
-    if (str == NULL)
-        goto error;
-    posArgs = PyTuple_Pack(2, pattern, str);
-    if (posArgs == NULL)
-        goto error;
-    match = PyObject_Call(fmatch, posArgs, NULL);
-    if (match == NULL)
-        goto error;
-    if (match != Py_None)
-        ret = 1;
-    goto end;
-
-error:
-    ret = -1;
-end:
-    Py_XDECREF(match);
-    Py_XDECREF(posArgs);
-    Py_XDECREF(str);
-    Py_XDECREF(pattern);
-    Py_XDECREF(fmatch);
-    Py_XDECREF(module);
-    Py_XDECREF(modname);
-    return ret;
-}
-
 /*
     Parse a template at the head of the wikicode string.
 */
 static int Tokenizer_parse_template(Tokenizer* self, int has_content)
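Note: the helper deleted above re-imported Python's re module and ran a fresh fullmatch call for every character examined while backtracking over a candidate URI scheme, and its -1 error result forced an extra failure branch on the caller. The hunks below replace all of that with a direct character test. A rough Python model of the equivalence, a sketch that ignores the rare Unicode corner cases where regex \w and str.isalnum() can disagree:

    import re

    def is_non_word_slow(ch):
        # What the deleted C helper computed by calling re.fullmatch from C.
        return re.fullmatch(r"\W", ch) is not None

    def is_non_word_fast(ch):
        # \w means "alphanumeric or underscore", so \W is its negation.
        return not (ch.isalnum() or ch == "_")

    assert all(is_non_word_slow(c) == is_non_word_fast(c)
               for c in "a Z 9 _ - / . é".split())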
@@ -576,7 +516,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
     static const char* valid = URISCHEME;
     Textbuffer *scheme_buffer = Textbuffer_new(&self->text);
     PyObject *scheme;
-    Py_UCS4 chunk;
+    Py_UCS4 ch;
     Py_ssize_t i;
     int slashes, j;
     uint64_t new_context;
@@ -586,15 +526,10 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
     // We have to backtrack through the textbuffer looking for our scheme since
     // it was just parsed as text:
     for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) {
-        chunk = Textbuffer_read(self->topstack->textbuffer, i);
-        // stop at the first non-word character
-        int is_non_word = is_non_word_character(chunk);
-        if (is_non_word < 0) {
-            Textbuffer_dealloc(scheme_buffer);
-            return -1;
-        }
-        else if (is_non_word == 1)
-            goto end_of_loop;
+        ch = Textbuffer_read(self->topstack->textbuffer, i);
+        // Stop at the first non-word character (equivalent to \W in regex)
+        if (!Py_UNICODE_ISALNUM(ch) && ch != '_')
+            break;
         j = 0;
         do {
             if (!valid[j]) {
@@ -602,10 +537,9 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
                 FAIL_ROUTE(0);
                 return 0;
             }
-        } while (chunk != (Py_UCS4) valid[j++]);
-        Textbuffer_write(scheme_buffer, chunk);
+        } while (ch != (Py_UCS4) valid[j++]);
+        Textbuffer_write(scheme_buffer, ch);
     }
-end_of_loop:
     Textbuffer_reverse(scheme_buffer);
     scheme = Textbuffer_render(scheme_buffer);
     if (!scheme) {
@@ -670,17 +604,17 @@ static int Tokenizer_handle_free_link_text(
 }
 
 /*
-    Return whether the current head is the end of a free link.
+    Return whether the current head is the end of a URI.
 */
 static int
-Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
+Tokenizer_is_uri_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
 {
     // Built from Tokenizer_parse()'s end sentinels:
     Py_UCS4 after = Tokenizer_read(self, 2);
     uint64_t ctx = self->topstack->context;
 
     return (!this || this == '\n' || this == '[' || this == ']' ||
-            this == '<' || this == '>' || this == '"' ||
+            this == '<' || this == '>' || this == '"' || this == ' ' ||
             (this == '\'' && next == '\'') ||
             (this == '|' && ctx & LC_TEMPLATE) ||
             (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
@@ -723,50 +657,48 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
             if (Tokenizer_parse_comment(self))
                 return NULL;
         }
-        else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) {
-            self->head--;
-            return Tokenizer_pop(self);
-        }
-        else if (!this || this == '\n')
-            return Tokenizer_fail_route(self);
         else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) {
             PUSH_TAIL_BUFFER(extra, NULL)
             if (Tokenizer_parse_template_or_argument(self))
                 return NULL;
         }
-        else if (this == ']')
-            return Tokenizer_pop(self);
-        else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) {
-            if (brackets) {
+        else if (brackets) {
+            if (!this || this == '\n')
+                return Tokenizer_fail_route(self);
+            if (this == ']')
+                return Tokenizer_pop(self);
+            if (Tokenizer_is_uri_end(self, this, next)) {
                 if (this == ' ') {
                     if (Tokenizer_emit(self, ExternalLinkSeparator))
                         return NULL;
+                    self->head++;
                 }
                 else {
                     PyObject* kwargs = PyDict_New();
                     if (!kwargs)
                         return NULL;
-                    if (this != ' ')
-                        PyDict_SetItemString(kwargs, "suppress_space", Py_True);
+                    PyDict_SetItemString(kwargs, "suppress_space", Py_True);
                     if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs))
                         return NULL;
                 }
                 self->topstack->context ^= LC_EXT_LINK_URI;
                 self->topstack->context |= LC_EXT_LINK_TITLE;
-                if (this == ' ')
-                    self->head++;
                 return Tokenizer_parse(self, 0, 0);
             }
-            if (Textbuffer_write(extra, this))
-                return NULL;
-            return Tokenizer_pop(self);
-        }
-        else if (!brackets) {
-            if (Tokenizer_handle_free_link_text(self, &parens, extra, this))
+            if (Tokenizer_emit_char(self, this))
                 return NULL;
         }
         else {
-            if (Tokenizer_emit_char(self, this))
+            if (Tokenizer_is_uri_end(self, this, next)) {
+                if (this == ' ') {
+                    if (Textbuffer_write(extra, this))
+                        return NULL;
+                }
+                else
+                    self->head--;
+                return Tokenizer_pop(self);
+            }
+            if (Tokenizer_handle_free_link_text(self, &parens, extra, this))
                 return NULL;
         }
         self->head++;
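Note: the restructured loop separates the two link grammars instead of mixing them in one else-if chain: bracketed links must close with "]", fail on a newline or end-of-input, and treat the first space as the URL/title separator, while free links simply stop before the first URI-ending character. A sketch of the observable difference, assuming the library's published API:

    import mwparserfromhell

    # Free link: " bar" is left as plain text after the link.
    free = mwparserfromhell.parse("See https://example.com/foo bar")
    print(free.filter_external_links())

    # Bracketed link: "bar" becomes the link title after the separator.
    bracketed = mwparserfromhell.parse("[https://example.com/foo bar]")
    print(bracketed.filter_external_links())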
@@ -129,10 +129,10 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
 static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 {
     PyObject *input, *tokens;
-    uint64_t context = 0;
+    unsigned long long context = 0;
     int skip_style_tags = 0;
 
-    if (PyArg_ParseTuple(args, "U|ii", &input, &context, &skip_style_tags)) {
+    if (PyArg_ParseTuple(args, "U|Kp", &input, &context, &skip_style_tags)) {
         Py_INCREF(input);
         if (load_tokenizer_text(&self->text, input))
             return NULL;
@@ -143,7 +143,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 
         /* Failed to parse a Unicode object; try a string instead. */
         PyErr_Clear();
-        if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context,
+        if (!PyArg_ParseTuple(args, "s#|Kp", &encoded, &size, &context,
                               &skip_style_tags))
             return NULL;
         if (!(input = PyUnicode_FromStringAndSize(encoded, size)))
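Note: this pair of hunks is the #277 fix from the changelog. The old format code "i" told PyArg_ParseTuple to write a 4-byte int through a pointer that actually addressed an 8-byte uint64_t. On little-endian machines the written bytes happen to be the uint64's least significant ones, so the value survives when the rest of the variable is zero; on big-endian machines they land in the most significant half, and the context flags come back shifted by 32 bits. "K" (unsigned long long, matching the new variable type) fills all 8 bytes, and "p" is the predicate converter appropriate for the boolean skip_style_tags. A small simulation of the mismatch:

    import struct

    def uint64_after_int32_write(byte_order, value):
        # Simulate "i" writing a 4-byte int into the first bytes of a
        # zero-initialized 8-byte slot that C then reads as a uint64_t.
        slot = bytearray(8)
        slot[0:4] = struct.pack(byte_order + "i", value)
        return struct.unpack(byte_order + "Q", bytes(slot))[0]

    print(uint64_after_int32_write("<", 2))  # little-endian: 2 (right, by luck)
    print(uint64_after_int32_write(">", 2))  # big-endian: 8589934592 == 2 << 32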
@@ -1,4 +1,4 @@ | |||
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2012-2021 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -60,8 +60,9 @@ class Tokenizer:
     USES_C = False
     START = object()
     END = object()
-    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
+    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", '"', "#", "*", ";",
                ":", "/", "-", "!", "\n", START, END]
+    URISCHEME = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
     MAX_DEPTH = 40
     regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
     tag_splitter = re.compile(r"([\s\"\'\\]+)")
@@ -323,7 +324,7 @@ class Tokenizer:
         self._head += 2
         try:
             # If the wikilink looks like an external link, parse it as such:
-            link, _extra, _delta = self._really_parse_external_link(True)
+            link, _extra = self._really_parse_external_link(True)
         except BadRoute:
             self._head = reset + 1
             try:
@@ -366,8 +367,7 @@ class Tokenizer:
                 self._emit_text("//")
                 self._head += 2
             else:
-                valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
-                all_valid = lambda: all(char in valid for char in self._read())
+                all_valid = lambda: all(char in self.URISCHEME for char in self._read())
                 scheme = ""
                 while self._read() is not self.END and all_valid():
                     scheme += self._read()
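Note: the scheme alphabet checked by the lambda is now the shared URISCHEME class constant rather than one of two identical local strings; it is the RFC 3986 scheme character set (letters, digits, "+", ".", "-"). A standalone sketch of the check:

    URISCHEME = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"

    def all_valid(chunk):
        # Every character of the chunk currently being read must be a
        # legal scheme character for the scan to continue.
        return all(char in URISCHEME for char in chunk)

    print(all_valid("https"))   # True
    print(all_valid("ht~tp"))   # False: "~" can never appear in a scheme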
@@ -386,17 +386,16 @@ class Tokenizer:
 
     def _parse_free_uri_scheme(self):
         """Parse the URI scheme of a free (no brackets) external link."""
-        valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
         scheme = []
         try:
             # We have to backtrack through the textbuffer looking for our
             # scheme since it was just parsed as text:
             for chunk in reversed(self._textbuffer):
                 for char in reversed(chunk):
-                    # stop at the first non-word character
+                    # Stop at the first non-word character
                     if re.fullmatch(r"\W", char):
                         raise StopIteration()
-                    if char not in valid:
+                    if char not in self.URISCHEME:
                         raise BadRoute()
                     scheme.append(char)
         except StopIteration:
@@ -434,15 +433,15 @@ class Tokenizer:
             self._emit_text(this)
         return punct, tail
 
-    def _is_free_link_end(self, this, nxt):
-        """Return whether the current head is the end of a free link."""
+    def _is_uri_end(self, this, nxt):
+        """Return whether the current head is the end of a URI."""
         # Built from _parse()'s end sentinels:
         after, ctx = self._read(2), self._context
-        equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING
-        return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or
+        return (this in (self.END, "\n", "[", "]", "<", ">", '"') or
+                " " in this or
                 this == nxt == "'" or
                 (this == "|" and ctx & contexts.TEMPLATE) or
-                (this == "=" and ctx & equal_sign_contexts) or
+                (this == "=" and ctx & (contexts.TEMPLATE_PARAM_KEY | contexts.HEADING)) or
                 (this == nxt == "}" and ctx & contexts.TEMPLATE) or
                 (this == nxt == after == "}" and ctx & contexts.ARGUMENT))
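Note: folding the space test into the renamed _is_uri_end() (the C version gains the matching this == ' ' clause) lets the parse loop drop its separate space special case. A standalone model of the simple sentinels, leaving out the template/argument context flags:

    END = object()  # stand-in for the tokenizer's END marker

    def is_uri_end(this, nxt):
        # "this" is a whole text chunk in the Python tokenizer, so a space
        # anywhere inside the chunk ends the URI part.
        return (this in (END, "\n", "[", "]", "<", ">", '"') or
                " " in this or
                this == nxt == "'")

    print(is_uri_end("example.com/foo", "x"))  # False
    print(is_uri_end("foo bar", "x"))          # True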
@@ -451,6 +450,7 @@ class Tokenizer:
         if brackets:
             self._parse_bracketed_uri_scheme()
             invalid = ("\n", " ", "]")
+            punct = ()
         else:
             self._parse_free_uri_scheme()
             invalid = ("\n", " ", "[", "]")
@@ -465,53 +465,47 @@ class Tokenizer:
                     self._emit_text(tail)
                     tail = ""
                 self._parse_entity()
-            elif (this == "<" and nxt == "!" and self._read(2) ==
-                  self._read(3) == "-"):
+            elif this == "<" and nxt == "!" and self._read(2) == self._read(3) == "-":
                 if tail:
                     self._emit_text(tail)
                     tail = ""
                 self._parse_comment()
-            elif not brackets and self._is_free_link_end(this, nxt):
-                return self._pop(), tail, -1
-            elif this is self.END or this == "\n":
-                self._fail_route()
             elif this == nxt == "{" and self._can_recurse():
                 if tail:
                     self._emit_text(tail)
                     tail = ""
                 self._parse_template_or_argument()
-            elif this == "]":
-                return self._pop(), tail, 0
-            elif this == "'" and nxt == "'":
-                separator = tokens.ExternalLinkSeparator()
-                separator.suppress_space = True
-                self._emit(separator)
-                self._context ^= contexts.EXT_LINK_URI
-                self._context |= contexts.EXT_LINK_TITLE
-                return self._parse(push=False), None, 0
-            elif any(ch in this for ch in (" ", "\n", "[", "]", "<", ">",
-                                           "\"")):
-                before, after = re.split(r"[ \n[\]<>\"]", this, maxsplit=1)
-                delimiter = this[len(before)]
-                if brackets:
-                    self._emit_text(before)
-                    separator = tokens.ExternalLinkSeparator()
-                    if delimiter != " ":
+            elif brackets:
+                if this is self.END or this == "\n":
+                    self._fail_route()
+                if this == "]":
+                    return self._pop(), None
+                if self._is_uri_end(this, nxt):
+                    if " " in this:
+                        before, after = this.split(" ", 1)
+                        self._emit_text(before)
+                        self._emit(tokens.ExternalLinkSeparator())
+                        if after:
+                            self._emit_text(after)
+                        self._head += 1
+                    else:
+                        separator = tokens.ExternalLinkSeparator()
                         separator.suppress_space = True
-                    self._emit(separator)
-                    if after:
-                        self._emit_text(after)
+                        self._emit(separator)
                     self._context ^= contexts.EXT_LINK_URI
                     self._context |= contexts.EXT_LINK_TITLE
-                    if delimiter == " ":
-                        self._head += 1
-                    return self._parse(push=False), None, 0
-                punct, tail = self._handle_free_link_text(punct, tail, before)
-                return self._pop(), tail + " " + after, 0
-            elif not brackets:
-                punct, tail = self._handle_free_link_text(punct, tail, this)
-            else:
+                    return self._parse(push=False), None
                 self._emit_text(this)
+            else:
+                if self._is_uri_end(this, nxt):
+                    if this is not self.END and " " in this:
+                        before, after = this.split(" ", 1)
+                        punct, tail = self._handle_free_link_text(punct, tail, before)
+                        tail += " " + after
+                    else:
+                        self._head -= 1
+                    return self._pop(), tail
+                punct, tail = self._handle_free_link_text(punct, tail, this)
             self._head += 1
 
     def _remove_uri_scheme_from_textbuffer(self, scheme):
@@ -536,7 +530,7 @@ class Tokenizer:
         reset = self._head
         self._head += 1
         try:
-            link, extra, delta = self._really_parse_external_link(brackets)
+            link, extra = self._really_parse_external_link(brackets)
         except BadRoute:
             self._head = reset
             if not brackets and self._context & contexts.DL_TERM:
@@ -550,7 +544,6 @@ class Tokenizer:
             self._emit(tokens.ExternalLinkOpen(brackets=brackets))
             self._emit_all(link)
             self._emit(tokens.ExternalLinkClose())
-            self._head += delta
             if extra:
                 self._emit_text(extra)
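Note: with the rewind handled inside _really_parse_external_link() itself (the free-link branch now does self._head -= 1 before popping), the third "delta" return value had no remaining purpose, so both call sites shrink to a two-value unpack:

    # Before: callers compensated for the parser overshooting the link.
    #     link, extra, delta = self._really_parse_external_link(brackets)
    #     self._head += delta
    # After: the head is already positioned when the call returns.
    #     link, extra = self._really_parse_external_link(brackets)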
@@ -854,8 +847,8 @@ class Tokenizer:
                 depth -= 1
                 if depth == 0:  # pragma: no cover (untestable/exceptional)
                     raise ParserError(
-                        "_handle_single_tag_end() got an unexpected "
-                        "TagCloseSelfclose")
+                        "_handle_single_tag_end() got an unexpected TagCloseSelfclose"
+                    )
         else:  # pragma: no cover (untestable/exceptional case)
             raise ParserError("_handle_single_tag_end() missed a TagCloseOpen")
         padding = stack[index].padding