diff --git a/.gitignore b/.gitignore
index 930f0bf..0a03112 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ docs/_build
 scripts/*.log
 htmlcov/
 .idea/
+.pytest_cache/
diff --git a/CHANGELOG b/CHANGELOG
index 6dbd975..09d14e7 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,8 @@
 v0.7 (unreleased):
 
 - Added Linux AArch64 wheels. (#276)
+- Fixed C integer conversion, manifesting as parsing errors on big-endian
+  platforms. (#277)
 
 v0.6.2 (released May 16, 2021):
 
diff --git a/docs/changelog.rst b/docs/changelog.rst
index ebb2482..7fe93dc 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -9,6 +9,9 @@ Unreleased
 
 - Added Linux AArch64 wheels.
   (`#276 `_)
+- Fixed C integer conversion, manifesting as parsing errors on big-endian
+  platforms.
+  (`#277 `_)
 
 v0.6.2
 ------
diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
index 6e9022d..740e9bf 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2019 Ben Kurtovic
+Copyright (C) 2012-2021 Ben Kurtovic
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -100,66 +100,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
 }
 
 /*
-    Check if the given character is a non-word character.
-
-    Equivalent to this Python code:
-
-    def is_non_word_character(ch):
-        if re.fullmatch(r"\W", chunk):
-            return True
-        return False
-*/
-static int is_non_word_character(Py_UCS4 ch)
-{
-    int ret = 0;
-    PyObject* modname = NULL;
-    PyObject* module = NULL;
-    PyObject* fmatch = NULL;
-    PyObject* pattern = NULL;
-    PyObject* str = NULL;
-    PyObject* posArgs = NULL;
-    PyObject* match = NULL;
-
-    modname = PyUnicode_FromString("re");
-    if (modname == NULL)
-        goto error;
-    module = PyImport_Import(modname);
-    if (module == NULL)
-        goto error;
-    fmatch = PyObject_GetAttrString(module, "fullmatch");
-    if (fmatch == NULL)
-        goto error;
-    pattern = PyUnicode_FromString("\\W");
-    if (pattern == NULL)
-        goto error;
-    str = PyUnicode_FROM_SINGLE(ch);
-    if (str == NULL)
-        goto error;
-    posArgs = PyTuple_Pack(2, pattern, str);
-    if (posArgs == NULL)
-        goto error;
-    match = PyObject_Call(fmatch, posArgs, NULL);
-    if (match == NULL)
-        goto error;
-
-    if (match != Py_None)
-        ret = 1;
-    goto end;
-
-    error:
-    ret = -1;
-    end:
-    Py_XDECREF(match);
-    Py_XDECREF(posArgs);
-    Py_XDECREF(str);
-    Py_XDECREF(pattern);
-    Py_XDECREF(fmatch);
-    Py_XDECREF(module);
-    Py_XDECREF(modname);
-    return ret;
-}
-
-/*
     Parse a template at the head of the wikicode string.
 */
 static int Tokenizer_parse_template(Tokenizer* self, int has_content)
@@ -576,7 +516,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
     static const char* valid = URISCHEME;
     Textbuffer *scheme_buffer = Textbuffer_new(&self->text);
     PyObject *scheme;
-    Py_UCS4 chunk;
+    Py_UCS4 ch;
     Py_ssize_t i;
     int slashes, j;
     uint64_t new_context;
@@ -586,15 +526,10 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
     // We have to backtrack through the textbuffer looking for our scheme since
    // it was just parsed as text:
     for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) {
-        chunk = Textbuffer_read(self->topstack->textbuffer, i);
-        // stop at the first non-word character
-        int is_non_word = is_non_word_character(chunk);
-        if (is_non_word < 0) {
-            Textbuffer_dealloc(scheme_buffer);
-            return -1;
-        }
-        else if (is_non_word == 1)
-            goto end_of_loop;
+        ch = Textbuffer_read(self->topstack->textbuffer, i);
+        // Stop at the first non-word character (equivalent to \W in regex)
+        if (!Py_UNICODE_ISALNUM(ch) && ch != '_')
+            break;
         j = 0;
         do {
             if (!valid[j]) {
@@ -602,10 +537,9 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
                 FAIL_ROUTE(0);
                 return 0;
             }
-        } while (chunk != (Py_UCS4) valid[j++]);
-        Textbuffer_write(scheme_buffer, chunk);
+        } while (ch != (Py_UCS4) valid[j++]);
+        Textbuffer_write(scheme_buffer, ch);
     }
-    end_of_loop:
     Textbuffer_reverse(scheme_buffer);
     scheme = Textbuffer_render(scheme_buffer);
     if (!scheme) {
@@ -670,17 +604,17 @@ static int Tokenizer_handle_free_link_text(
 }
 
 /*
-    Return whether the current head is the end of a free link.
+    Return whether the current head is the end of a URI.
 */
 static int
-Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
+Tokenizer_is_uri_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
 {
     // Built from Tokenizer_parse()'s end sentinels:
     Py_UCS4 after = Tokenizer_read(self, 2);
     uint64_t ctx = self->topstack->context;
 
     return (!this || this == '\n' || this == '[' || this == ']' ||
-            this == '<' || this == '>' || this == '"' ||
+            this == '<' || this == '>' || this == '"' || this == ' ' ||
             (this == '\'' && next == '\'') ||
             (this == '|' && ctx & LC_TEMPLATE) ||
             (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
@@ -723,50 +657,48 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
             if (Tokenizer_parse_comment(self))
                 return NULL;
         }
-        else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) {
-            self->head--;
-            return Tokenizer_pop(self);
-        }
-        else if (!this || this == '\n')
-            return Tokenizer_fail_route(self);
         else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) {
             PUSH_TAIL_BUFFER(extra, NULL)
             if (Tokenizer_parse_template_or_argument(self))
                 return NULL;
         }
-        else if (this == ']')
-            return Tokenizer_pop(self);
-        else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) {
-            if (brackets) {
+        else if (brackets) {
+            if (!this || this == '\n')
+                return Tokenizer_fail_route(self);
+            if (this == ']')
+                return Tokenizer_pop(self);
+            if (Tokenizer_is_uri_end(self, this, next)) {
                 if (this == ' ') {
                     if (Tokenizer_emit(self, ExternalLinkSeparator))
                         return NULL;
+                    self->head++;
                 }
                 else {
                     PyObject* kwargs = PyDict_New();
                     if (!kwargs)
                         return NULL;
-                    if (this != ' ')
-                        PyDict_SetItemString(kwargs, "suppress_space", Py_True);
+                    PyDict_SetItemString(kwargs, "suppress_space", Py_True);
                     if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs))
                         return NULL;
                 }
                 self->topstack->context ^= LC_EXT_LINK_URI;
                 self->topstack->context |= LC_EXT_LINK_TITLE;
-                if (this == ' ')
-                    self->head++;
                 return Tokenizer_parse(self, 0, 0);
             }
-            if (Textbuffer_write(extra, this))
-                return NULL;
-            return Tokenizer_pop(self);
-        }
-        else if (!brackets) {
-            if (Tokenizer_handle_free_link_text(self, &parens, extra, this))
+            if (Tokenizer_emit_char(self, this))
                 return NULL;
         }
         else {
-            if (Tokenizer_emit_char(self, this))
+            if (Tokenizer_is_uri_end(self, this, next)) {
+                if (this == ' ') {
+                    if (Textbuffer_write(extra, this))
+                        return NULL;
+                }
+                else
+                    self->head--;
+                return Tokenizer_pop(self);
+            }
+            if (Tokenizer_handle_free_link_text(self, &parens, extra, this))
                 return NULL;
         }
         self->head++;
diff --git a/src/mwparserfromhell/parser/ctokenizer/tokenizer.c b/src/mwparserfromhell/parser/ctokenizer/tokenizer.c
index a501032..066f527 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tokenizer.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tokenizer.c
@@ -129,10 +129,10 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
 static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 {
     PyObject *input, *tokens;
-    uint64_t context = 0;
+    unsigned long long context = 0;
     int skip_style_tags = 0;
 
-    if (PyArg_ParseTuple(args, "U|ii", &input, &context, &skip_style_tags)) {
+    if (PyArg_ParseTuple(args, "U|Kp", &input, &context, &skip_style_tags)) {
         Py_INCREF(input);
         if (load_tokenizer_text(&self->text, input))
             return NULL;
@@ -143,7 +143,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 
     /* Failed to parse a Unicode object; try a string instead. */
     PyErr_Clear();
-    if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context,
+    if (!PyArg_ParseTuple(args, "s#|Kp", &encoded, &size, &context,
                           &skip_style_tags))
         return NULL;
     if (!(input = PyUnicode_FromStringAndSize(encoded, size)))
diff --git a/src/mwparserfromhell/parser/tokenizer.py b/src/mwparserfromhell/parser/tokenizer.py
index 76efd9b..efac02c 100644
--- a/src/mwparserfromhell/parser/tokenizer.py
+++ b/src/mwparserfromhell/parser/tokenizer.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2012-2020 Ben Kurtovic
+# Copyright (C) 2012-2021 Ben Kurtovic
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -60,8 +60,9 @@ class Tokenizer:
     USES_C = False
     START = object()
     END = object()
-    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
+    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", '"', "#", "*", ";",
                ":", "/", "-", "!", "\n", START, END]
+    URISCHEME = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
     MAX_DEPTH = 40
     regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
     tag_splitter = re.compile(r"([\s\"\'\\]+)")
@@ -323,7 +324,7 @@ class Tokenizer:
         self._head += 2
         try:
             # If the wikilink looks like an external link, parse it as such:
-            link, _extra, _delta = self._really_parse_external_link(True)
+            link, _extra = self._really_parse_external_link(True)
         except BadRoute:
             self._head = reset + 1
             try:
@@ -366,8 +367,7 @@ class Tokenizer:
             self._emit_text("//")
             self._head += 2
         else:
-            valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
-            all_valid = lambda: all(char in valid for char in self._read())
+            all_valid = lambda: all(char in self.URISCHEME for char in self._read())
             scheme = ""
             while self._read() is not self.END and all_valid():
                 scheme += self._read()
@@ -386,17 +386,16 @@ class Tokenizer:
 
     def _parse_free_uri_scheme(self):
         """Parse the URI scheme of a free (no brackets) external link."""
-        valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
         scheme = []
         try:
             # We have to backtrack through the textbuffer looking for our
             # scheme since it was just parsed as text:
             for chunk in reversed(self._textbuffer):
                 for char in reversed(chunk):
-                    # stop at the first non-word character
+                    # Stop at the first non-word character
                     if re.fullmatch(r"\W", char):
                         raise StopIteration()
-                    if char not in valid:
+                    if char not in self.URISCHEME:
                         raise BadRoute()
                     scheme.append(char)
         except StopIteration:
@@ -434,15 +433,15 @@ class Tokenizer:
             self._emit_text(this)
         return punct, tail
 
-    def _is_free_link_end(self, this, nxt):
-        """Return whether the current head is the end of a free link."""
+    def _is_uri_end(self, this, nxt):
+        """Return whether the current head is the end of a URI."""
         # Built from _parse()'s end sentinels:
         after, ctx = self._read(2), self._context
-        equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING
-        return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or
+        return (this in (self.END, "\n", "[", "]", "<", ">", '"') or
+                " " in this or
                 this == nxt == "'" or
                 (this == "|" and ctx & contexts.TEMPLATE) or
-                (this == "=" and ctx & equal_sign_contexts) or
+                (this == "=" and ctx & (contexts.TEMPLATE_PARAM_KEY | contexts.HEADING)) or
                 (this == nxt == "}" and ctx & contexts.TEMPLATE) or
                 (this == nxt == after == "}" and ctx & contexts.ARGUMENT))
 
@@ -451,6 +450,7 @@ class Tokenizer:
         if brackets:
             self._parse_bracketed_uri_scheme()
             invalid = ("\n", " ", "]")
+            punct = ()
         else:
             self._parse_free_uri_scheme()
             invalid = ("\n", " ", "[", "]")
@@ -465,53 +465,47 @@ class Tokenizer:
                 if tail:
                     self._emit_text(tail)
                     tail = ""
                 self._parse_entity()
-            elif (this == "<" and nxt == "!" and self._read(2) ==
-                  self._read(3) == "-"):
+            elif this == "<" and nxt == "!" and self._read(2) == self._read(3) == "-":
                 if tail:
                     self._emit_text(tail)
                     tail = ""
                 self._parse_comment()
-            elif not brackets and self._is_free_link_end(this, nxt):
-                return self._pop(), tail, -1
-            elif this is self.END or this == "\n":
-                self._fail_route()
             elif this == nxt == "{" and self._can_recurse():
                 if tail:
                     self._emit_text(tail)
                     tail = ""
                 self._parse_template_or_argument()
-            elif this == "]":
-                return self._pop(), tail, 0
-            elif this == "'" and nxt == "'":
-                separator = tokens.ExternalLinkSeparator()
-                separator.suppress_space = True
-                self._emit(separator)
-                self._context ^= contexts.EXT_LINK_URI
-                self._context |= contexts.EXT_LINK_TITLE
-                return self._parse(push=False), None, 0
-            elif any(ch in this for ch in (" ", "\n", "[", "]", "<", ">",
-                                           "\"")):
-                before, after = re.split(r"[ \n[\]<>\"]", this, maxsplit=1)
-                delimiter = this[len(before)]
-                if brackets:
-                    self._emit_text(before)
-                    separator = tokens.ExternalLinkSeparator()
-                    if delimiter != " ":
+            elif brackets:
+                if this is self.END or this == "\n":
+                    self._fail_route()
+                if this == "]":
+                    return self._pop(), None
+                if self._is_uri_end(this, nxt):
+                    if " " in this:
+                        before, after = this.split(" ", 1)
+                        self._emit_text(before)
+                        self._emit(tokens.ExternalLinkSeparator())
+                        if after:
+                            self._emit_text(after)
+                        self._head += 1
+                    else:
+                        separator = tokens.ExternalLinkSeparator()
                         separator.suppress_space = True
-                    self._emit(separator)
-                    if after:
-                        self._emit_text(after)
+                        self._emit(separator)
                     self._context ^= contexts.EXT_LINK_URI
                     self._context |= contexts.EXT_LINK_TITLE
-                    if delimiter == " ":
-                        self._head += 1
-                    return self._parse(push=False), None, 0
-                punct, tail = self._handle_free_link_text(punct, tail, before)
-                return self._pop(), tail + " " + after, 0
-            elif not brackets:
-                punct, tail = self._handle_free_link_text(punct, tail, this)
-            else:
+                    return self._parse(push=False), None
                 self._emit_text(this)
+            else:
+                if self._is_uri_end(this, nxt):
+                    if this is not self.END and " " in this:
+                        before, after = this.split(" ", 1)
+                        punct, tail = self._handle_free_link_text(punct, tail, before)
+                        tail += " " + after
+                    else:
+                        self._head -= 1
+                    return self._pop(), tail
+                punct, tail = self._handle_free_link_text(punct, tail, this)
            self._head += 1
 
    def _remove_uri_scheme_from_textbuffer(self, scheme):
@@ -536,7 +530,7 @@
         reset = self._head
         self._head += 1
         try:
-            link, extra, delta = self._really_parse_external_link(brackets)
+            link, extra = self._really_parse_external_link(brackets)
         except BadRoute:
             self._head = reset
             if not brackets and self._context & contexts.DL_TERM:
@@ -550,7 +544,6 @@
             self._emit(tokens.ExternalLinkOpen(brackets=brackets))
             self._emit_all(link)
             self._emit(tokens.ExternalLinkClose())
-            self._head += delta
             if extra:
                 self._emit_text(extra)
 
@@ -854,8 +847,8 @@
                 depth -= 1
                 if depth == 0:  # pragma: no cover (untestable/exceptional)
                     raise ParserError(
-                        "_handle_single_tag_end() got an unexpected "
-                        "TagCloseSelfclose")
+                        "_handle_single_tag_end() got an unexpected TagCloseSelfclose"
+                    )
         else:  # pragma: no cover (untestable/exceptional case)
             raise ParserError("_handle_single_tag_end() missed a TagCloseOpen")
         padding = stack[index].padding
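
Illustrative note, not part of the patch: the "U|ii" -> "U|Kp" change in Tokenizer_tokenize is the "C integer conversion" fix named in the changelog. The old "i" format code made PyArg_ParseTuple write a 32-bit int through a pointer that actually referred to a 64-bit context variable; on little-endian machines the bytes happen to land in the low-order half, but on big-endian machines they land in the high-order half, so the tokenizer started with a garbage context and mis-parsed. The standalone C sketch below (hypothetical names, not repository code) simulates that size mismatch with memcpy:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    uint64_t context = 0;   /* 64-bit slot, like the old declaration */
    int parsed = 1;         /* value an "i" format code would store */

    /* Write a 4-byte int at the start of the 8-byte variable, which is
       effectively what passing &context to an "i" format code did: */
    memcpy(&context, &parsed, sizeof(parsed));

    /* Little-endian: prints 1.  Big-endian: prints 4294967296 (1 << 32),
       which is why the bad context only showed up on those platforms. */
    printf("%llu\n", (unsigned long long) context);
    return 0;
}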