@@ -14,3 +14,4 @@ docs/_build
 scripts/*.log
 htmlcov/
 .idea/
+.pytest_cache/
@@ -1,6 +1,8 @@
 v0.7 (unreleased):
 
 - Added Linux AArch64 wheels. (#276)
+- Fixed C integer conversion, manifesting as parsing errors on big-endian
+  platforms. (#277)
 
 v0.6.2 (released May 16, 2021):
 
@@ -9,6 +9,9 @@ Unreleased
 - Added Linux AArch64 wheels.
   (`#276 <https://github.com/earwig/mwparserfromhell/issues/276>`_)
+- Fixed C integer conversion, manifesting as parsing errors on big-endian
+  platforms.
+  (`#277 <https://github.com/earwig/mwparserfromhell/issues/277>`_)
 
 v0.6.2
 ------
@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -100,66 +100,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
 }
 
 /*
-    Check if the given character is a non-word character.
-
-    Equivalent to this Python code:
-
-    def is_non_word_character(ch):
-        if re.fullmatch(r"\W", chunk):
-            return True
-        return False
-*/
-static int is_non_word_character(Py_UCS4 ch)
-{
-    int ret = 0;
-    PyObject* modname = NULL;
-    PyObject* module = NULL;
-    PyObject* fmatch = NULL;
-    PyObject* pattern = NULL;
-    PyObject* str = NULL;
-    PyObject* posArgs = NULL;
-    PyObject* match = NULL;
-
-    modname = PyUnicode_FromString("re");
-    if (modname == NULL)
-        goto error;
-    module = PyImport_Import(modname);
-    if (module == NULL)
-        goto error;
-    fmatch = PyObject_GetAttrString(module, "fullmatch");
-    if (fmatch == NULL)
-        goto error;
-    pattern = PyUnicode_FromString("\\W");
-    if (pattern == NULL)
-        goto error;
-    str = PyUnicode_FROM_SINGLE(ch);
-    if (str == NULL)
-        goto error;
-    posArgs = PyTuple_Pack(2, pattern, str);
-    if (posArgs == NULL)
-        goto error;
-    match = PyObject_Call(fmatch, posArgs, NULL);
-    if (match == NULL)
-        goto error;
-
-    if (match != Py_None)
-        ret = 1;
-    goto end;
-
-    error:
-    ret = -1;
-    end:
-    Py_XDECREF(match);
-    Py_XDECREF(posArgs);
-    Py_XDECREF(str);
-    Py_XDECREF(pattern);
-    Py_XDECREF(fmatch);
-    Py_XDECREF(module);
-    Py_XDECREF(modname);
-    return ret;
-}
-
-/*
     Parse a template at the head of the wikicode string.
 */
 static int Tokenizer_parse_template(Tokenizer* self, int has_content)
@@ -576,7 +516,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
     static const char* valid = URISCHEME;
     Textbuffer *scheme_buffer = Textbuffer_new(&self->text);
     PyObject *scheme;
-    Py_UCS4 chunk;
+    Py_UCS4 ch;
     Py_ssize_t i;
     int slashes, j;
     uint64_t new_context;
@@ -586,15 +526,10 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
     // We have to backtrack through the textbuffer looking for our scheme since
     // it was just parsed as text:
     for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) {
-        chunk = Textbuffer_read(self->topstack->textbuffer, i);
-        // stop at the first non-word character
-        int is_non_word = is_non_word_character(chunk);
-        if (is_non_word < 0) {
-            Textbuffer_dealloc(scheme_buffer);
-            return -1;
-        }
-        else if (is_non_word == 1)
-            goto end_of_loop;
+        ch = Textbuffer_read(self->topstack->textbuffer, i);
+        // Stop at the first non-word character (equivalent to \W in regex)
+        if (!Py_UNICODE_ISALNUM(ch) && ch != '_')
+            break;
         j = 0;
         do {
             if (!valid[j]) {
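The inline check above replaces a round-trip into Python's `re` module with two C-level tests. A rough Python sketch of the equivalence being relied on (the function names here are illustrative, not from the codebase; CPython defines `\w` in terms of `str.isalnum()` plus the underscore, so the two tests agree):

```python
import re

def is_non_word_regex(ch):
    # What the removed helper computed: \W matches any character that is
    # not a word character (letters, digits, underscore).
    return re.fullmatch(r"\W", ch) is not None

def is_non_word_inline(ch):
    # What the new C code computes: Py_UNICODE_ISALNUM corresponds roughly
    # to str.isalnum(), so "non-word" means neither alphanumeric nor "_".
    return not (ch.isalnum() or ch == "_")

assert is_non_word_regex("-") and is_non_word_inline("-")
assert not is_non_word_regex("a") and not is_non_word_inline("a")
```

Besides avoiding an interpreter call per character, the inline test cannot fail, which is why the `is_non_word < 0` error path and the `end_of_loop` label disappear.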
@@ -602,10 +537,9 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
                 FAIL_ROUTE(0);
                 return 0;
             }
-        } while (chunk != (Py_UCS4) valid[j++]);
-        Textbuffer_write(scheme_buffer, chunk);
+        } while (ch != (Py_UCS4) valid[j++]);
+        Textbuffer_write(scheme_buffer, ch);
     }
-    end_of_loop:
     Textbuffer_reverse(scheme_buffer);
     scheme = Textbuffer_render(scheme_buffer);
     if (!scheme) {
@@ -670,17 +604,17 @@ static int Tokenizer_handle_free_link_text(
 }
 
 /*
-    Return whether the current head is the end of a free link.
+    Return whether the current head is the end of a URI.
 */
 static int
-Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
+Tokenizer_is_uri_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
 {
     // Built from Tokenizer_parse()'s end sentinels:
     Py_UCS4 after = Tokenizer_read(self, 2);
     uint64_t ctx = self->topstack->context;
 
     return (!this || this == '\n' || this == '[' || this == ']' ||
-            this == '<' || this == '>' || this == '"' ||
+            this == '<' || this == '>' || this == '"' || this == ' ' ||
             (this == '\'' && next == '\'') ||
             (this == '|' && ctx & LC_TEMPLATE) ||
             (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
@@ -723,50 +657,48 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
             if (Tokenizer_parse_comment(self))
                 return NULL;
         }
-        else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) {
-            self->head--;
-            return Tokenizer_pop(self);
-        }
-        else if (!this || this == '\n')
-            return Tokenizer_fail_route(self);
         else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) {
             PUSH_TAIL_BUFFER(extra, NULL)
             if (Tokenizer_parse_template_or_argument(self))
                 return NULL;
         }
-        else if (this == ']')
-            return Tokenizer_pop(self);
-        else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) {
-            if (brackets) {
+        else if (brackets) {
+            if (!this || this == '\n')
+                return Tokenizer_fail_route(self);
+            if (this == ']')
+                return Tokenizer_pop(self);
+            if (Tokenizer_is_uri_end(self, this, next)) {
                 if (this == ' ') {
                     if (Tokenizer_emit(self, ExternalLinkSeparator))
                         return NULL;
-                    self->head++;
                 }
                 else {
                     PyObject* kwargs = PyDict_New();
                     if (!kwargs)
                         return NULL;
-                    if (this != ' ')
-                        PyDict_SetItemString(kwargs, "suppress_space", Py_True);
+                    PyDict_SetItemString(kwargs, "suppress_space", Py_True);
                     if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs))
                         return NULL;
                 }
                 self->topstack->context ^= LC_EXT_LINK_URI;
                 self->topstack->context |= LC_EXT_LINK_TITLE;
+                if (this == ' ')
+                    self->head++;
                 return Tokenizer_parse(self, 0, 0);
             }
-            if (Textbuffer_write(extra, this))
-                return NULL;
-            return Tokenizer_pop(self);
-        }
-        else if (!brackets) {
-            if (Tokenizer_handle_free_link_text(self, &parens, extra, this))
+            if (Tokenizer_emit_char(self, this))
                 return NULL;
         }
         else {
-            if (Tokenizer_emit_char(self, this))
+            if (Tokenizer_is_uri_end(self, this, next)) {
+                if (this == ' ') {
+                    if (Textbuffer_write(extra, this))
+                        return NULL;
+                }
+                else
+                    self->head--;
+                return Tokenizer_pop(self);
+            }
+            if (Tokenizer_handle_free_link_text(self, &parens, extra, this))
                 return NULL;
         }
         self->head++;
@@ -129,10 +129,10 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
 static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 {
     PyObject *input, *tokens;
-    uint64_t context = 0;
+    unsigned long long context = 0;
     int skip_style_tags = 0;
 
-    if (PyArg_ParseTuple(args, "U|ii", &input, &context, &skip_style_tags)) {
+    if (PyArg_ParseTuple(args, "U|Kp", &input, &context, &skip_style_tags)) {
         Py_INCREF(input);
         if (load_tokenizer_text(&self->text, input))
             return NULL;
@@ -143,7 +143,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 
         /* Failed to parse a Unicode object; try a string instead. */
         PyErr_Clear();
-        if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context,
+        if (!PyArg_ParseTuple(args, "s#|Kp", &encoded, &size, &context,
                               &skip_style_tags))
            return NULL;
        if (!(input = PyUnicode_FromStringAndSize(encoded, size)))
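Both calls now use the "K" (unsigned long long) and "p" (bool) format codes instead of "ii". The old "i" format wrote a 4-byte int through a pointer to the 64-bit context, which is harmless on little-endian machines but lands in the high bytes on big-endian ones; that is the parsing-error bug reported as #277. An illustrative ctypes reproduction of the mismatch (not code from the library):

```python
import ctypes

# Emulate PyArg_ParseTuple's "i" format writing a 32-bit int through a
# pointer that actually points at a uint64_t:
ctx = ctypes.c_uint64(0)
ctypes.cast(ctypes.pointer(ctx), ctypes.POINTER(ctypes.c_int32))[0] = 1

# Little-endian: the low 4 bytes are written, so ctx.value == 0x1.
# Big-endian: the HIGH 4 bytes are written, so ctx.value == 0x100000000
# and every context-flag test in the tokenizer sees garbage.
print(hex(ctx.value))
```

Declaring `context` as `unsigned long long` and parsing it with "K" makes the parsed width match the storage width on every platform.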
@@ -1,4 +1,4 @@ | |||||
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
# Copyright (C) 2012-2021 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
# | # | ||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | # Permission is hereby granted, free of charge, to any person obtaining a copy | ||||
# of this software and associated documentation files (the "Software"), to deal | # of this software and associated documentation files (the "Software"), to deal | ||||
@@ -60,8 +60,9 @@ class Tokenizer:
     USES_C = False
     START = object()
     END = object()
-    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
+    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", '"', "#", "*", ";",
                ":", "/", "-", "!", "\n", START, END]
+    URISCHEME = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
     MAX_DEPTH = 40
     regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
     tag_splitter = re.compile(r"([\s\"\'\\]+)")
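`URISCHEME` is the scheme alphabet from RFC 3986 (letters, digits, "+", "-", "."), previously duplicated as a local `valid` string in two methods; the `'"'` added to `MARKERS` alongside it makes the tokenizer split text runs at double quotes, so the URI-end checks below can see them. A small sketch of the kind of check the shared constant supports (`looks_like_scheme` is a hypothetical name, not part of the class):

```python
URISCHEME = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"

def looks_like_scheme(text):
    # RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    return bool(text) and text[0].isalpha() and all(c in URISCHEME for c in text)

assert looks_like_scheme("https")
assert looks_like_scheme("svn+ssh")
assert not looks_like_scheme("ht tp")  # space is not in the alphabet
```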
@@ -323,7 +324,7 @@ class Tokenizer:
         self._head += 2
         try:
             # If the wikilink looks like an external link, parse it as such:
-            link, _extra, _delta = self._really_parse_external_link(True)
+            link, _extra = self._really_parse_external_link(True)
         except BadRoute:
             self._head = reset + 1
             try:
@@ -366,8 +367,7 @@ class Tokenizer:
             self._emit_text("//")
             self._head += 2
         else:
-            valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
-            all_valid = lambda: all(char in valid for char in self._read())
+            all_valid = lambda: all(char in self.URISCHEME for char in self._read())
             scheme = ""
             while self._read() is not self.END and all_valid():
                 scheme += self._read()
@@ -386,17 +386,16 @@ class Tokenizer:
 
     def _parse_free_uri_scheme(self):
         """Parse the URI scheme of a free (no brackets) external link."""
-        valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
         scheme = []
         try:
             # We have to backtrack through the textbuffer looking for our
             # scheme since it was just parsed as text:
             for chunk in reversed(self._textbuffer):
                 for char in reversed(chunk):
-                    # stop at the first non-word character
+                    # Stop at the first non-word character
                     if re.fullmatch(r"\W", char):
                         raise StopIteration()
-                    if char not in valid:
+                    if char not in self.URISCHEME:
                         raise BadRoute()
                     scheme.append(char)
         except StopIteration:
@@ -434,15 +433,15 @@ class Tokenizer:
             self._emit_text(this)
         return punct, tail
 
-    def _is_free_link_end(self, this, nxt):
-        """Return whether the current head is the end of a free link."""
+    def _is_uri_end(self, this, nxt):
+        """Return whether the current head is the end of a URI."""
         # Built from _parse()'s end sentinels:
         after, ctx = self._read(2), self._context
-        equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING
-        return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or
+        return (this in (self.END, "\n", "[", "]", "<", ">", '"') or
+                " " in this or
                 this == nxt == "'" or
                 (this == "|" and ctx & contexts.TEMPLATE) or
-                (this == "=" and ctx & equal_sign_contexts) or
+                (this == "=" and ctx & (contexts.TEMPLATE_PARAM_KEY | contexts.HEADING)) or
                 (this == nxt == "}" and ctx & contexts.TEMPLATE) or
                 (this == nxt == after == "}" and ctx & contexts.ARGUMENT))
 
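With `" " in this` added (and `"` newly in `MARKERS`), both spaces and double quotes now terminate the URI. A quick behavioral check, assuming a build of mwparserfromhell that includes this patch:

```python
import mwparserfromhell

# A double quote should end a free link's URI instead of being swallowed:
code = mwparserfromhell.parse('See https://example.com"quoted" for details')
print(code.filter_external_links())  # expected: ['https://example.com']
```

Note the check is `" " in this` rather than `this == " "`: `this` is a whole text chunk between markers, and since space is not a marker, it can sit anywhere inside the chunk.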
@@ -451,6 +450,7 @@ class Tokenizer:
         if brackets:
             self._parse_bracketed_uri_scheme()
             invalid = ("\n", " ", "]")
+            punct = ()
         else:
             self._parse_free_uri_scheme()
             invalid = ("\n", " ", "[", "]")
@@ -465,53 +465,47 @@ class Tokenizer:
                     self._emit_text(tail)
                     tail = ""
                 self._parse_entity()
-            elif (this == "<" and nxt == "!" and self._read(2) ==
-                    self._read(3) == "-"):
+            elif this == "<" and nxt == "!" and self._read(2) == self._read(3) == "-":
                 if tail:
                     self._emit_text(tail)
                     tail = ""
                 self._parse_comment()
-            elif not brackets and self._is_free_link_end(this, nxt):
-                return self._pop(), tail, -1
-            elif this is self.END or this == "\n":
-                self._fail_route()
             elif this == nxt == "{" and self._can_recurse():
                 if tail:
                     self._emit_text(tail)
                     tail = ""
                 self._parse_template_or_argument()
-            elif this == "]":
-                return self._pop(), tail, 0
-            elif this == "'" and nxt == "'":
-                separator = tokens.ExternalLinkSeparator()
-                separator.suppress_space = True
-                self._emit(separator)
-                self._context ^= contexts.EXT_LINK_URI
-                self._context |= contexts.EXT_LINK_TITLE
-                return self._parse(push=False), None, 0
-            elif any(ch in this for ch in (" ", "\n", "[", "]", "<", ">",
-                                           "\"")):
-                before, after = re.split(r"[ \n[\]<>\"]", this, maxsplit=1)
-                delimiter = this[len(before)]
-                if brackets:
-                    self._emit_text(before)
-                    separator = tokens.ExternalLinkSeparator()
-                    if delimiter != " ":
+            elif brackets:
+                if this is self.END or this == "\n":
+                    self._fail_route()
+                if this == "]":
+                    return self._pop(), None
+                if self._is_uri_end(this, nxt):
+                    if " " in this:
+                        before, after = this.split(" ", 1)
+                        self._emit_text(before)
+                        self._emit(tokens.ExternalLinkSeparator())
+                        if after:
+                            self._emit_text(after)
+                        self._head += 1
+                    else:
+                        separator = tokens.ExternalLinkSeparator()
                         separator.suppress_space = True
-                    self._emit(separator)
-                    if after:
-                        self._emit_text(after)
+                        self._emit(separator)
                     self._context ^= contexts.EXT_LINK_URI
                     self._context |= contexts.EXT_LINK_TITLE
-                    if delimiter == " ":
-                        self._head += 1
-                    return self._parse(push=False), None, 0
-                punct, tail = self._handle_free_link_text(punct, tail, before)
-                return self._pop(), tail + " " + after, 0
-            elif not brackets:
-                punct, tail = self._handle_free_link_text(punct, tail, this)
-            else:
+                    return self._parse(push=False), None
                 self._emit_text(this)
+            else:
+                if self._is_uri_end(this, nxt):
+                    if this is not self.END and " " in this:
+                        before, after = this.split(" ", 1)
+                        punct, tail = self._handle_free_link_text(punct, tail, before)
+                        tail += " " + after
+                    else:
+                        self._head -= 1
+                    return self._pop(), tail
+                punct, tail = self._handle_free_link_text(punct, tail, this)
             self._head += 1
 
     def _remove_uri_scheme_from_textbuffer(self, scheme):
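For bracketed links, the chunk is now split at its first space: everything before it ends the URI, an `ExternalLinkSeparator` is emitted, and the remainder begins the title. For free links, reaching a URI end instead closes the link and returns the leftover text as `tail`. A usage sketch, under the same assumption as above (a patched build):

```python
import mwparserfromhell

# Bracketed: the first space inside the brackets separates URI from title.
link = mwparserfromhell.parse("[https://example.com Example site]").filter_external_links()[0]
print(link.url)    # https://example.com
print(link.title)  # Example site

# Free: the link ends at the space; the rest stays ordinary text.
free = mwparserfromhell.parse("read https://example.com today")
print(free.filter_external_links())  # expected: ['https://example.com']
```

Dropping the third `delta` return value is possible because head adjustment now happens inside the method (`self._head += 1` and `self._head -= 1`) instead of being handed back to the caller.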
@@ -536,7 +530,7 @@ class Tokenizer:
         reset = self._head
         self._head += 1
         try:
-            link, extra, delta = self._really_parse_external_link(brackets)
+            link, extra = self._really_parse_external_link(brackets)
         except BadRoute:
             self._head = reset
             if not brackets and self._context & contexts.DL_TERM:
@@ -550,7 +544,6 @@ class Tokenizer:
         self._emit(tokens.ExternalLinkOpen(brackets=brackets))
         self._emit_all(link)
         self._emit(tokens.ExternalLinkClose())
-        self._head += delta
         if extra:
             self._emit_text(extra)
 
@@ -854,8 +847,8 @@ class Tokenizer:
                 depth -= 1
                 if depth == 0:  # pragma: no cover (untestable/exceptional)
                     raise ParserError(
-                        "_handle_single_tag_end() got an unexpected "
-                        "TagCloseSelfclose")
+                        "_handle_single_tag_end() got an unexpected TagCloseSelfclose"
+                    )
         else:  # pragma: no cover (untestable/exceptional case)
             raise ParserError("_handle_single_tag_end() missed a TagCloseOpen")
         padding = stack[index].padding