diff --git a/CHANGELOG b/CHANGELOG index 5b5d794..d878d0d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,9 +8,17 @@ v0.4.1 (unreleased): includes when denoting tags, but not comments. - Fixed the behavior of preserve_spacing in Template.add() and keep_field in Template.remove() on parameters with hidden keys. +- Removed _ListProxy.detach(). SmartLists now use weak references and their + children are garbage-collected properly. - Fixed parser bugs involving: - templates with completely blank names; - templates with newlines and comments. +- Heavy refactoring and fixes to the C tokenizer, including: + - corrected a design flaw in text handling, allowing for substantial speed + improvements when parsing long strings of plain text; + - implemented new Python 3.3 PEP 393 Unicode APIs. +- Fixed various bugs in SmartList, including one that was causing memory issues + on 64-bit builds of Python 2 on Windows. - Fixed some bugs in the release scripts. v0.4 (released May 23, 2015): diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000..ffefaee --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,64 @@ +# This config file is used by appveyor.com to build Windows release binaries + +version: 0.4.1.dev0-b{build} + +branches: + only: + - master + +skip_tags: true + +environment: + global: + # See: http://stackoverflow.com/a/13751649/163740 + WRAPPER: "cmd /E:ON /V:ON /C .\\scripts\\win_wrapper.cmd" + PIP: "%WRAPPER% %PYTHON%\\Scripts\\pip.exe" + SETUPPY: "%WRAPPER% %PYTHON%\\python setup.py --with-extension" + PYPI_USERNAME: "earwigbot" + PYPI_PASSWORD: + secure: gOIcvPxSC2ujuhwOzwj3v8xjq3CCYd8keFWVnguLM+gcL0e02qshDHy7gwZZwj0+ + + matrix: + - PYTHON: "C:\\Python27" + PYTHON_VERSION: "2.7" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python27-x64" + PYTHON_VERSION: "2.7" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python33" + PYTHON_VERSION: "3.3" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python33-x64" + PYTHON_VERSION: "3.3" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python34" + PYTHON_VERSION: "3.4" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python34-x64" + PYTHON_VERSION: "3.4" + PYTHON_ARCH: "64" + +install: + - "%PIP% install wheel twine" + +build_script: + - "%SETUPPY% build" + +test_script: + - "%SETUPPY% -q test" + +after_test: + - "%SETUPPY% bdist_wheel" + +on_success: + - "twine upload dist\\* -u %PYPI_USERNAME% -p %PYPI_PASSWORD%" + +artifacts: + - path: dist\* + +deploy: off diff --git a/docs/changelog.rst b/docs/changelog.rst index 4e64a8b..f64aba6 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -13,13 +13,24 @@ Unreleased - Added support for Python 3.5. - ``<`` and ``>`` are now disallowed in wikilink titles and template names. This includes when denoting tags, but not comments. -- Fixed the behavior of *preserve_spacing* in :func:`~.Template.add` and - *keep_field* in :func:`~.Template.remove` on parameters with hidden keys. +- Fixed the behavior of *preserve_spacing* in :meth:`.Template.add` and + *keep_field* in :meth:`.Template.remove` on parameters with hidden keys. +- Removed :meth:`._ListProxy.detach`. :class:`.SmartList`\ s now use weak + references and their children are garbage-collected properly. - Fixed parser bugs involving: - templates with completely blank names; - templates with newlines and comments. +- Heavy refactoring and fixes to the C tokenizer, including: + + - corrected a design flaw in text handling, allowing for substantial speed + improvements when parsing long strings of plain text; + - implemented new Python 3.3 + `PEP 393 <https://www.python.org/dev/peps/pep-0393/>`_ Unicode APIs.
+ +- Fixed various bugs in :class:`.SmartList`, including one that was causing + memory issues on 64-bit builds of Python 2 on Windows. - Fixed some bugs in the release scripts. v0.4 diff --git a/mwparserfromhell/compat.py b/mwparserfromhell/compat.py index 590a271..7a83cd1 100644 --- a/mwparserfromhell/compat.py +++ b/mwparserfromhell/compat.py @@ -18,14 +18,12 @@ if py3k: bytes = bytes str = str range = range - maxsize = sys.maxsize import html.entities as htmlentities else: bytes = str str = unicode range = xrange - maxsize = sys.maxint import htmlentitydefs as htmlentities del sys diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index e0ba16b..cdacb3d 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -81,10 +81,8 @@ def is_single_only(tag): """Return whether or not the given *tag* must exist without a close tag.""" return tag.lower() in SINGLE_ONLY -def is_scheme(scheme, slashes=True, reverse=False): +def is_scheme(scheme, slashes=True): """Return whether *scheme* is valid for external links.""" - if reverse: # Convenience for C - scheme = scheme[::-1] scheme = scheme.lower() if slashes: return scheme in URI_SCHEMES diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h new file mode 100644 index 0000000..abade02 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -0,0 +1,125 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#pragma once + +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html +#endif + +#include <Python.h> +#include <math.h> +#include <structmember.h> + +/* Compatibility macros */ + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +#ifndef uint64_t +#define uint64_t unsigned PY_LONG_LONG +#endif + +#define malloc PyObject_Malloc // XXX: yuck +#define realloc PyObject_Realloc +#define free PyObject_Free + +/* Unicode support macros */ + +#if defined(IS_PY3K) && PY_MINOR_VERSION >= 3 +#define PEP_393 +#endif + +#ifdef PEP_393 +#define Unicode Py_UCS4 +#define PyUnicode_FROM_SINGLE(chr) \ + PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1) +#else +#define Unicode Py_UNICODE +#define PyUnicode_FROM_SINGLE(chr) \ + PyUnicode_FromUnicode(&(chr), 1) +#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE +#endif + +/* Error handling macros */ + +#define BAD_ROUTE self->route_state +#define BAD_ROUTE_CONTEXT self->route_context +#define FAIL_ROUTE(context) { \ + self->route_state = 1; \ + self->route_context = context; \ + } +#define RESET_ROUTE() self->route_state = 0 + +/* Shared globals */ + +extern char** entitydefs; + +extern PyObject* NOARGS; +extern PyObject* definitions; + +/* Structs */ + +typedef struct { + Py_ssize_t capacity; + Py_ssize_t length; +#ifdef PEP_393 + PyObject* object; + int kind; + void* data; +#else + Py_UNICODE* data; +#endif +} Textbuffer; + +struct Stack { + PyObject* stack; + uint64_t context; + Textbuffer* textbuffer; + struct Stack* next; +}; +typedef struct Stack Stack; + +typedef struct { + PyObject* object; /* base PyUnicodeObject object */ + Py_ssize_t length; /* length of object, in code points */ +#ifdef PEP_393 + int kind; /* object's kind value */ + void* data; /* object's raw unicode buffer */ +#else + Py_UNICODE* buf; /* object's internal buffer */ +#endif +} TokenizerInput; + +typedef struct { + PyObject_HEAD + TokenizerInput text; /* text to tokenize */ + Stack* topstack; /* topmost stack */ + Py_ssize_t head; /* current position in text */ + int global; /* global context */ + int depth; /* stack recursion depth */ + int cycles; /* total number of stack recursions */ + int route_state; /* whether a BadRoute has been triggered */ + uint64_t route_context; /* context when the last BadRoute was triggered */ + int skip_style_tags; /* temp fix for the sometimes broken tag parser */ +} Tokenizer; diff --git a/mwparserfromhell/parser/ctokenizer/contexts.h b/mwparserfromhell/parser/ctokenizer/contexts.h new file mode 100644 index 0000000..4e4a8c7 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/contexts.h @@ -0,0 +1,105 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#pragma once + +/* Local contexts */ + +#define LC_TEMPLATE 0x0000000000000007 +#define LC_TEMPLATE_NAME 0x0000000000000001 +#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002 +#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004 + +#define LC_ARGUMENT 0x0000000000000018 +#define LC_ARGUMENT_NAME 0x0000000000000008 +#define LC_ARGUMENT_DEFAULT 0x0000000000000010 + +#define LC_WIKILINK 0x0000000000000060 +#define LC_WIKILINK_TITLE 0x0000000000000020 +#define LC_WIKILINK_TEXT 0x0000000000000040 + +#define LC_EXT_LINK 0x0000000000000180 +#define LC_EXT_LINK_URI 0x0000000000000080 +#define LC_EXT_LINK_TITLE 0x0000000000000100 + +#define LC_HEADING 0x0000000000007E00 +#define LC_HEADING_LEVEL_1 0x0000000000000200 +#define LC_HEADING_LEVEL_2 0x0000000000000400 +#define LC_HEADING_LEVEL_3 0x0000000000000800 +#define LC_HEADING_LEVEL_4 0x0000000000001000 +#define LC_HEADING_LEVEL_5 0x0000000000002000 +#define LC_HEADING_LEVEL_6 0x0000000000004000 + +#define LC_TAG 0x0000000000078000 +#define LC_TAG_OPEN 0x0000000000008000 +#define LC_TAG_ATTR 0x0000000000010000 +#define LC_TAG_BODY 0x0000000000020000 +#define LC_TAG_CLOSE 0x0000000000040000 + +#define LC_STYLE 0x0000000000780000 +#define LC_STYLE_ITALICS 0x0000000000080000 +#define LC_STYLE_BOLD 0x0000000000100000 +#define LC_STYLE_PASS_AGAIN 0x0000000000200000 +#define LC_STYLE_SECOND_PASS 0x0000000000400000 + +#define LC_DLTERM 0x0000000000800000 + +#define LC_SAFETY_CHECK 0x000000007F000000 +#define LC_HAS_TEXT 0x0000000001000000 +#define LC_FAIL_ON_TEXT 0x0000000002000000 +#define LC_FAIL_NEXT 0x0000000004000000 +#define LC_FAIL_ON_LBRACE 0x0000000008000000 +#define LC_FAIL_ON_RBRACE 0x0000000010000000 +#define LC_FAIL_ON_EQUALS 0x0000000020000000 +#define LC_HAS_TEMPLATE 0x0000000040000000 + +#define LC_TABLE 0x0000001F80000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000 +#define LC_TABLE_OPEN 0x0000000080000000 +#define LC_TABLE_CELL_OPEN 0x0000000100000000 +#define LC_TABLE_CELL_STYLE 0x0000000200000000 +#define LC_TABLE_ROW_OPEN 0x0000000400000000 +#define LC_TABLE_TD_LINE 0x0000000800000000 +#define LC_TABLE_TH_LINE 0x0000001000000000 + +/* Global contexts */ + +#define GL_HEADING 0x1 + +/* Aggregate contexts */ + +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN) +#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) +#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) + +/* Tag contexts */ + +#define TAG_NAME 0x01 +#define TAG_ATTR_READY 0x02 +#define TAG_ATTR_NAME 0x04 +#define TAG_ATTR_VALUE 0x08 +#define TAG_QUOTED 0x10 +#define TAG_NOTE_SPACE 0x20 +#define TAG_NOTE_EQUALS 0x40 +#define TAG_NOTE_QUOTE 0x80 diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.c b/mwparserfromhell/parser/ctokenizer/tag_data.c new file mode 100644 index 0000000..2f67966 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tag_data.c @@ -0,0 +1,78 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is 
hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "tag_data.h" +#include "contexts.h" + +/* + Initialize a new TagData object. +*/ +TagData* TagData_new(TokenizerInput* text) +{ +#define ALLOC_BUFFER(name) \ + name = Textbuffer_new(text); \ + if (!name) { \ + TagData_dealloc(self); \ + return NULL; \ + } + + TagData *self = malloc(sizeof(TagData)); + if (!self) { + PyErr_NoMemory(); + return NULL; + } + self->context = TAG_NAME; + ALLOC_BUFFER(self->pad_first) + ALLOC_BUFFER(self->pad_before_eq) + ALLOC_BUFFER(self->pad_after_eq) + self->quoter = 0; + self->reset = 0; + return self; + +#undef ALLOC_BUFFER +} + +/* + Deallocate the given TagData object. +*/ +void TagData_dealloc(TagData* self) +{ + if (self->pad_first) + Textbuffer_dealloc(self->pad_first); + if (self->pad_before_eq) + Textbuffer_dealloc(self->pad_before_eq); + if (self->pad_after_eq) + Textbuffer_dealloc(self->pad_after_eq); + free(self); +} + +/* + Clear the internal buffers of the given TagData object. +*/ +int TagData_reset_buffers(TagData* self) +{ + if (Textbuffer_reset(self->pad_first) || + Textbuffer_reset(self->pad_before_eq) || + Textbuffer_reset(self->pad_after_eq)) + return -1; + return 0; +} diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.h b/mwparserfromhell/parser/ctokenizer/tag_data.h new file mode 100644 index 0000000..f184081 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tag_data.h @@ -0,0 +1,43 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#pragma once + +#include "common.h" +#include "textbuffer.h" + +/* Structs */ + +typedef struct { + uint64_t context; + Textbuffer* pad_first; + Textbuffer* pad_before_eq; + Textbuffer* pad_after_eq; + Unicode quoter; + Py_ssize_t reset; +} TagData; + +/* Functions */ + +TagData* TagData_new(TokenizerInput*); +void TagData_dealloc(TagData*); +int TagData_reset_buffers(TagData*); diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.c b/mwparserfromhell/parser/ctokenizer/textbuffer.c new file mode 100644 index 0000000..0c711c5 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.c @@ -0,0 +1,232 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "textbuffer.h" + +#define INITIAL_CAPACITY 32 +#define RESIZE_FACTOR 2 +#define CONCAT_EXTRA 32 + +/* + Internal allocation function for textbuffers. +*/ +static int internal_alloc(Textbuffer* self, Unicode maxchar) +{ + self->capacity = INITIAL_CAPACITY; + self->length = 0; + +#ifdef PEP_393 + self->object = PyUnicode_New(self->capacity, maxchar); + if (!self->object) + return -1; + self->kind = PyUnicode_KIND(self->object); + self->data = PyUnicode_DATA(self->object); +#else + (void) maxchar; // Unused + self->data = malloc(sizeof(Unicode) * self->capacity); + if (!self->data) + return -1; +#endif + + return 0; +} + +/* + Internal deallocation function for textbuffers. +*/ +static void internal_dealloc(Textbuffer* self) +{ +#ifdef PEP_393 + Py_DECREF(self->object); +#else + free(self->data); +#endif +} + +/* + Internal resize function. +*/ +static int internal_resize(Textbuffer* self, Py_ssize_t new_cap) +{ +#ifdef PEP_393 + PyObject *newobj; + void *newdata; + + newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object)); + if (!newobj) + return -1; + newdata = PyUnicode_DATA(newobj); + memcpy(newdata, self->data, self->length * self->kind); + Py_DECREF(self->object); + self->object = newobj; + self->data = newdata; +#else + if (!(self->data = realloc(self->data, sizeof(Unicode) * new_cap))) + return -1; +#endif + + self->capacity = new_cap; + return 0; +} + +/* + Create a new textbuffer object. 
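+
+   On PEP 393 builds the buffer is created with the same maxchar as the
+   tokenizer's input text (see internal_alloc above), so later writes never
+   need to up-convert the storage kind. A minimal usage sketch, with error
+   checks elided and 'input' standing in for a hypothetical, already
+   initialized TokenizerInput*:
+
+       Textbuffer* buf = Textbuffer_new(input);
+       Textbuffer_write(buf, 'a');              // append one code point
+       PyObject* str = Textbuffer_render(buf);  // build u"a"
+       Textbuffer_dealloc(buf);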
+*/ +Textbuffer* Textbuffer_new(TokenizerInput* text) +{ + Textbuffer* self = malloc(sizeof(Textbuffer)); + Unicode maxchar = 0; + +#ifdef PEP_393 + maxchar = PyUnicode_MAX_CHAR_VALUE(text->object); +#endif + + if (!self) + goto fail_nomem; + if (internal_alloc(self, maxchar) < 0) + goto fail_dealloc; + return self; + + fail_dealloc: + free(self); + fail_nomem: + PyErr_NoMemory(); + return NULL; +} + +/* + Deallocate the given textbuffer. +*/ +void Textbuffer_dealloc(Textbuffer* self) +{ + internal_dealloc(self); + free(self); +} + +/* + Reset a textbuffer to its initial, empty state. +*/ +int Textbuffer_reset(Textbuffer* self) +{ + Unicode maxchar = 0; + +#ifdef PEP_393 + maxchar = PyUnicode_MAX_CHAR_VALUE(self->object); +#endif + + internal_dealloc(self); + if (internal_alloc(self, maxchar)) + return -1; + return 0; +} + +/* + Write a Unicode codepoint to the given textbuffer. +*/ +int Textbuffer_write(Textbuffer* self, Unicode code) +{ + if (self->length >= self->capacity) { + if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0) + return -1; + } + +#ifdef PEP_393 + PyUnicode_WRITE(self->kind, self->data, self->length++, code); +#else + self->data[self->length++] = code; +#endif + + return 0; +} + +/* + Read a Unicode codepoint from the given index of the given textbuffer. + + This function does not check for bounds. +*/ +Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index) +{ +#ifdef PEP_393 + return PyUnicode_READ(self->kind, self->data, index); +#else + return self->data[index]; +#endif +} + +/* + Return the contents of the textbuffer as a Python Unicode object. +*/ +PyObject* Textbuffer_render(Textbuffer* self) +{ +#ifdef PEP_393 + return PyUnicode_FromKindAndData(self->kind, self->data, self->length); +#else + return PyUnicode_FromUnicode(self->data, self->length); +#endif +} + +/* + Concatenate the 'other' textbuffer onto the end of the given textbuffer. +*/ +int Textbuffer_concat(Textbuffer* self, Textbuffer* other) +{ + Py_ssize_t newlen = self->length + other->length; + + if (newlen > self->capacity) { + if (internal_resize(self, newlen + CONCAT_EXTRA) < 0) + return -1; + } + +#ifdef PEP_393 + assert(self->kind == other->kind); + memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data, + other->length * other->kind); +#else + memcpy(self->data + self->length, other->data, + other->length * sizeof(Unicode)); +#endif + + self->length = newlen; + return 0; +} + +/* + Reverse the contents of the given textbuffer. 
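+
+   This exists for Tokenizer_parse_free_uri_scheme() in tok_parse.c, which
+   backtracks through already-emitted text, collecting the scheme's code
+   points in reverse order, and then reverses the buffer before rendering
+   it. As an illustrative sketch (not code from this diff): writing 'p',
+   't', 't', 'h' and then reversing yields a buffer that renders as "http".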
+*/ +void Textbuffer_reverse(Textbuffer* self) +{ + Py_ssize_t i, end = self->length - 1; + Unicode tmp; + + for (i = 0; i < self->length / 2; i++) { +#ifdef PEP_393 + tmp = PyUnicode_READ(self->kind, self->data, i); + PyUnicode_WRITE(self->kind, self->data, i, + PyUnicode_READ(self->kind, self->data, end - i)); + PyUnicode_WRITE(self->kind, self->data, end - i, tmp); +#else + tmp = self->data[i]; + self->data[i] = self->data[end - i]; + self->data[end - i] = tmp; +#endif + } +} diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.h b/mwparserfromhell/parser/ctokenizer/textbuffer.h new file mode 100644 index 0000000..123d240 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h @@ -0,0 +1,36 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#pragma once + +#include "common.h" + +/* Functions */ + +Textbuffer* Textbuffer_new(TokenizerInput*); +void Textbuffer_dealloc(Textbuffer*); +int Textbuffer_reset(Textbuffer*); +int Textbuffer_write(Textbuffer*, Unicode); +Unicode Textbuffer_read(Textbuffer*, Py_ssize_t); +PyObject* Textbuffer_render(Textbuffer*); +int Textbuffer_concat(Textbuffer*, Textbuffer*); +void Textbuffer_reverse(Textbuffer*); diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c similarity index 73% rename from mwparserfromhell/parser/tokenizer.c rename to mwparserfromhell/parser/ctokenizer/tok_parse.c index f4e801b..23cc246 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -1,5 +1,4 @@ /* -Tokenizer for MWParserFromHell Copyright (C) 2012-2015 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of @@ -21,12 +20,42 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "tokenizer.h" +#include "tok_parse.h" +#include "contexts.h" +#include "tag_data.h" +#include "tok_support.h" +#include "tokens.h" + +#define DIGITS "0123456789" +#define HEXDIGITS "0123456789abcdefABCDEF" +#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + +#define MAX_BRACES 255 +#define MAX_ENTITY_SIZE 8 + +#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? 
"dt" : "li") +#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL)) +#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL)) +#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL)) +#define IS_SCHEME(scheme, slashes) \ + (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False)) + +typedef struct { + PyObject* title; + int level; +} HeadingData; + +/* Forward declarations */ + +static int Tokenizer_parse_entity(Tokenizer*); +static int Tokenizer_parse_comment(Tokenizer*); +static int Tokenizer_handle_dl_term(Tokenizer*); +static int Tokenizer_parse_tag(Tokenizer*); /* - Determine whether the given Py_UNICODE is a marker. + Determine whether the given code point is a marker. */ -static int is_marker(Py_UNICODE this) +static int is_marker(Unicode this) { int i; @@ -40,7 +69,7 @@ static int is_marker(Py_UNICODE this) /* Given a context, return the heading level encoded within it. */ -static int heading_level_from_context(int n) +static int heading_level_from_context(uint64_t n) { int level; @@ -51,14 +80,13 @@ static int heading_level_from_context(int n) } /* - Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as + Call the given function in definitions.py, using 'in1' and 'in2' as parameters, and return its output as a bool. */ -static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2, - PyObject* in3) +static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2) { PyObject* func = PyObject_GetAttrString(definitions, funcname); - PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL); + PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL); int ans = (result == Py_True) ? 1 : 0; Py_DECREF(func); @@ -89,496 +117,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr) return lowered; } -static Textbuffer* Textbuffer_new(void) -{ - Textbuffer* buffer = malloc(sizeof(Textbuffer)); - - if (!buffer) { - PyErr_NoMemory(); - return NULL; - } - buffer->size = 0; - buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE); - if (!buffer->data) { - free(buffer); - PyErr_NoMemory(); - return NULL; - } - buffer->prev = buffer->next = NULL; - return buffer; -} - -static void Textbuffer_dealloc(Textbuffer* self) -{ - Textbuffer* next; - - while (self) { - free(self->data); - next = self->next; - free(self); - self = next; - } -} - -/* - Write a Unicode codepoint to the given textbuffer. -*/ -static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) -{ - Textbuffer* self = *this; - - if (self->size == TEXTBUFFER_BLOCKSIZE) { - Textbuffer* new = Textbuffer_new(); - if (!new) - return -1; - new->next = self; - self->prev = new; - *this = self = new; - } - self->data[self->size++] = code; - return 0; -} - -/* - Return the contents of the textbuffer as a Python Unicode object. 
-*/ -static PyObject* Textbuffer_render(Textbuffer* self) -{ - PyObject *result = PyUnicode_FromUnicode(self->data, self->size); - PyObject *left, *concat; - - while (self->next) { - self = self->next; - left = PyUnicode_FromUnicode(self->data, self->size); - concat = PyUnicode_Concat(left, result); - Py_DECREF(left); - Py_DECREF(result); - result = concat; - } - return result; -} - -static TagData* TagData_new(void) -{ - TagData *self = malloc(sizeof(TagData)); - - #define ALLOC_BUFFER(name) \ - name = Textbuffer_new(); \ - if (!name) { \ - TagData_dealloc(self); \ - return NULL; \ - } - - if (!self) { - PyErr_NoMemory(); - return NULL; - } - self->context = TAG_NAME; - ALLOC_BUFFER(self->pad_first) - ALLOC_BUFFER(self->pad_before_eq) - ALLOC_BUFFER(self->pad_after_eq) - self->quoter = self->reset = 0; - return self; -} - -static void TagData_dealloc(TagData* self) -{ - #define DEALLOC_BUFFER(name) \ - if (name) \ - Textbuffer_dealloc(name); - - DEALLOC_BUFFER(self->pad_first); - DEALLOC_BUFFER(self->pad_before_eq); - DEALLOC_BUFFER(self->pad_after_eq); - free(self); -} - -static int TagData_reset_buffers(TagData* self) -{ - #define RESET_BUFFER(name) \ - Textbuffer_dealloc(name); \ - name = Textbuffer_new(); \ - if (!name) \ - return -1; - - RESET_BUFFER(self->pad_first) - RESET_BUFFER(self->pad_before_eq) - RESET_BUFFER(self->pad_after_eq) - return 0; -} - -static PyObject* -Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds) -{ - Tokenizer* self = (Tokenizer*) type->tp_alloc(type, 0); - return (PyObject*) self; -} - -static void Tokenizer_dealloc(Tokenizer* self) -{ - Stack *this = self->topstack, *next; - Py_XDECREF(self->text); - - while (this) { - Py_DECREF(this->stack); - Textbuffer_dealloc(this->textbuffer); - next = this->next; - free(this); - this = next; - } - Py_TYPE(self)->tp_free((PyObject*) self); -} - -static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) -{ - static char* kwlist[] = {NULL}; - - if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist)) - return -1; - self->text = Py_None; - Py_INCREF(Py_None); - self->topstack = NULL; - self->head = self->length = self->global = self->depth = self->cycles = 0; - return 0; -} - -/* - Add a new token stack, context, and textbuffer to the list. -*/ -static int Tokenizer_push(Tokenizer* self, uint64_t context) -{ - Stack* top = malloc(sizeof(Stack)); - - if (!top) { - PyErr_NoMemory(); - return -1; - } - top->stack = PyList_New(0); - top->context = context; - top->textbuffer = Textbuffer_new(); - if (!top->textbuffer) - return -1; - top->next = self->topstack; - self->topstack = top; - self->depth++; - self->cycles++; - return 0; -} - -/* - Push the textbuffer onto the stack as a Text node and clear it. 
-*/ -static int Tokenizer_push_textbuffer(Tokenizer* self) -{ - PyObject *text, *kwargs, *token; - Textbuffer* buffer = self->topstack->textbuffer; - - if (buffer->size == 0 && !buffer->next) - return 0; - text = Textbuffer_render(buffer); - if (!text) - return -1; - kwargs = PyDict_New(); - if (!kwargs) { - Py_DECREF(text); - return -1; - } - PyDict_SetItemString(kwargs, "text", text); - Py_DECREF(text); - token = PyObject_Call(Text, NOARGS, kwargs); - Py_DECREF(kwargs); - if (!token) - return -1; - if (PyList_Append(self->topstack->stack, token)) { - Py_DECREF(token); - return -1; - } - Py_DECREF(token); - Textbuffer_dealloc(buffer); - self->topstack->textbuffer = Textbuffer_new(); - if (!self->topstack->textbuffer) - return -1; - return 0; -} - -/* - Pop and deallocate the top token stack/context/textbuffer. -*/ -static void Tokenizer_delete_top_of_stack(Tokenizer* self) -{ - Stack* top = self->topstack; - - Py_DECREF(top->stack); - Textbuffer_dealloc(top->textbuffer); - self->topstack = top->next; - free(top); - self->depth--; -} - -/* - Pop the current stack/context/textbuffer, returing the stack. -*/ -static PyObject* Tokenizer_pop(Tokenizer* self) -{ - PyObject* stack; - - if (Tokenizer_push_textbuffer(self)) - return NULL; - stack = self->topstack->stack; - Py_INCREF(stack); - Tokenizer_delete_top_of_stack(self); - return stack; -} - -/* - Pop the current stack/context/textbuffer, returing the stack. We will also - replace the underlying stack's context with the current stack's. -*/ -static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) -{ - PyObject* stack; - uint64_t context; - - if (Tokenizer_push_textbuffer(self)) - return NULL; - stack = self->topstack->stack; - Py_INCREF(stack); - context = self->topstack->context; - Tokenizer_delete_top_of_stack(self); - self->topstack->context = context; - return stack; -} - -/* - Fail the current tokenization route. Discards the current - stack/context/textbuffer and sets the BAD_ROUTE flag. -*/ -static void* Tokenizer_fail_route(Tokenizer* self) -{ - uint64_t context = self->topstack->context; - PyObject* stack = Tokenizer_pop(self); - - Py_XDECREF(stack); - FAIL_ROUTE(context); - return NULL; -} - -/* - Write a token to the current token stack. -*/ -static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) -{ - PyObject* instance; - - if (Tokenizer_push_textbuffer(self)) - return -1; - instance = PyObject_CallObject(token, NULL); - if (!instance) - return -1; - if (first ? PyList_Insert(self->topstack->stack, 0, instance) : - PyList_Append(self->topstack->stack, instance)) { - Py_DECREF(instance); - return -1; - } - Py_DECREF(instance); - return 0; -} - -/* - Write a token to the current token stack, with kwargs. Steals a reference - to kwargs. -*/ -static int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token, - PyObject* kwargs, int first) -{ - PyObject* instance; - - if (Tokenizer_push_textbuffer(self)) { - Py_DECREF(kwargs); - return -1; - } - instance = PyObject_Call(token, NOARGS, kwargs); - if (!instance) { - Py_DECREF(kwargs); - return -1; - } - if (first ? PyList_Insert(self->topstack->stack, 0, instance): - PyList_Append(self->topstack->stack, instance)) { - Py_DECREF(instance); - Py_DECREF(kwargs); - return -1; - } - Py_DECREF(instance); - Py_DECREF(kwargs); - return 0; -} - -/* - Write a Unicode codepoint to the current textbuffer. 
-*/ -static int Tokenizer_emit_char(Tokenizer* self, Py_UNICODE code) -{ - return Textbuffer_write(&(self->topstack->textbuffer), code); -} - -/* - Write a string of text to the current textbuffer. -*/ -static int Tokenizer_emit_text(Tokenizer* self, const char* text) -{ - int i = 0; - - while (text[i]) { - if (Tokenizer_emit_char(self, text[i])) - return -1; - i++; - } - return 0; -} - -/* - Write the contents of another textbuffer to the current textbuffer, - deallocating it in the process. -*/ -static int -Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse) -{ - Textbuffer *original = buffer; - long i; - - if (reverse) { - do { - for (i = buffer->size - 1; i >= 0; i--) { - if (Tokenizer_emit_char(self, buffer->data[i])) { - Textbuffer_dealloc(original); - return -1; - } - } - } while ((buffer = buffer->next)); - } - else { - while (buffer->next) - buffer = buffer->next; - do { - for (i = 0; i < buffer->size; i++) { - if (Tokenizer_emit_char(self, buffer->data[i])) { - Textbuffer_dealloc(original); - return -1; - } - } - } while ((buffer = buffer->prev)); - } - Textbuffer_dealloc(original); - return 0; -} - -/* - Write a series of tokens to the current stack at once. -*/ -static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist) -{ - int pushed = 0; - PyObject *stack, *token, *left, *right, *text; - Textbuffer* buffer; - Py_ssize_t size; - - if (PyList_GET_SIZE(tokenlist) > 0) { - token = PyList_GET_ITEM(tokenlist, 0); - switch (PyObject_IsInstance(token, Text)) { - case 0: - break; - case 1: { - pushed = 1; - buffer = self->topstack->textbuffer; - if (buffer->size == 0 && !buffer->next) - break; - left = Textbuffer_render(buffer); - if (!left) - return -1; - right = PyObject_GetAttrString(token, "text"); - if (!right) - return -1; - text = PyUnicode_Concat(left, right); - Py_DECREF(left); - Py_DECREF(right); - if (!text) - return -1; - if (PyObject_SetAttrString(token, "text", text)) { - Py_DECREF(text); - return -1; - } - Py_DECREF(text); - Textbuffer_dealloc(buffer); - self->topstack->textbuffer = Textbuffer_new(); - if (!self->topstack->textbuffer) - return -1; - break; - } - case -1: - return -1; - } - } - if (!pushed) { - if (Tokenizer_push_textbuffer(self)) - return -1; - } - stack = self->topstack->stack; - size = PyList_GET_SIZE(stack); - if (PyList_SetSlice(stack, size, size, tokenlist)) - return -1; - return 0; -} - -/* - Pop the current stack, write text, and then write the stack. 'text' is a - NULL-terminated array of chars. -*/ -static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text) -{ - PyObject* stack = Tokenizer_pop(self); - - if (Tokenizer_emit_text(self, text)) { - Py_DECREF(stack); - return -1; - } - if (stack) { - if (PyList_GET_SIZE(stack) > 0) { - if (Tokenizer_emit_all(self, stack)) { - Py_DECREF(stack); - return -1; - } - } - Py_DECREF(stack); - } - self->head--; - return 0; -} - -/* - Read the value at a relative point in the wikicode, forwards. -*/ -static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta) -{ - Py_ssize_t index = self->head + delta; - - if (index >= self->length) - return EMPTY; - return PyList_GET_ITEM(self->text, index); -} - -/* - Read the value at a relative point in the wikicode, backwards. -*/ -static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) -{ - Py_ssize_t index; - - if (delta > self->head) - return EMPTY; - index = self->head - delta; - return PyList_GET_ITEM(self->text, index); -} - /* Parse a template at the head of the wikicode string. 
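
   Dispatch between templates and arguments is driven by brace counting
   (see the hunk below): Tokenizer_parse_template_or_argument() counts the
   run of '{' characters at the head, capped at MAX_BRACES (255). As an
   illustrative sketch of the intended mapping (assumed, not spelled out in
   this diff): "{{foo}}" counts two braces and parses as a template, while
   "{{{foo}}}" counts three and parses as a template argument.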
*/ @@ -651,7 +189,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) PyObject *tokenlist; self->head += 2; - while (Tokenizer_READ(self, 0) == '{' && braces < MAX_BRACES) { + while (Tokenizer_read(self, 0) == '{' && braces < MAX_BRACES) { self->head++; braces++; } @@ -882,21 +420,21 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; Textbuffer* buffer; PyObject* scheme; - Py_UNICODE this; + Unicode this; int slashes, i; if (Tokenizer_push(self, LC_EXT_LINK_URI)) return -1; - if (Tokenizer_READ(self, 0) == '/' && Tokenizer_READ(self, 1) == '/') { + if (Tokenizer_read(self, 0) == '/' && Tokenizer_read(self, 1) == '/') { if (Tokenizer_emit_text(self, "//")) return -1; self->head += 2; } else { - buffer = Textbuffer_new(); + buffer = Textbuffer_new(&self->text); if (!buffer) return -1; - while ((this = Tokenizer_READ(self, 0))) { + while ((this = Tokenizer_read(self, 0))) { i = 0; while (1) { if (!valid[i]) @@ -905,7 +443,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) break; i++; } - Textbuffer_write(&buffer, this); + Textbuffer_write(buffer, this); if (Tokenizer_emit_char(self, this)) { Textbuffer_dealloc(buffer); return -1; @@ -923,8 +461,8 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) return -1; } self->head++; - slashes = (Tokenizer_READ(self, 0) == '/' && - Tokenizer_READ(self, 1) == '/'); + slashes = (Tokenizer_read(self, 0) == '/' && + Tokenizer_read(self, 1) == '/'); if (slashes) { if (Tokenizer_emit_text(self, "//")) { Textbuffer_dealloc(buffer); @@ -936,7 +474,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) Textbuffer_dealloc(buffer); if (!scheme) return -1; - if (!IS_SCHEME(scheme, slashes, 0)) { + if (!IS_SCHEME(scheme, slashes)) { Py_DECREF(scheme); Tokenizer_fail_route(self); return 0; @@ -952,46 +490,40 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) { static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; - Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; + Textbuffer *scheme_buffer = Textbuffer_new(&self->text); PyObject *scheme; - Py_UNICODE chunk; - long i; + Unicode chunk; + Py_ssize_t i; int slashes, j; if (!scheme_buffer) return -1; // We have to backtrack through the textbuffer looking for our scheme since // it was just parsed as text: - temp_buffer = self->topstack->textbuffer; - while (temp_buffer) { - for (i = temp_buffer->size - 1; i >= 0; i--) { - chunk = temp_buffer->data[i]; - if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) - goto end_of_loop; - j = 0; - while (1) { - if (!valid[j]) { - Textbuffer_dealloc(scheme_buffer); - FAIL_ROUTE(0); - return 0; - } - if (chunk == valid[j]) - break; - j++; + for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) { + chunk = Textbuffer_read(self->topstack->textbuffer, i); + if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) + goto end_of_loop; + j = 0; + do { + if (!valid[j]) { + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; } - Textbuffer_write(&scheme_buffer, chunk); - } - temp_buffer = temp_buffer->next; + } while (chunk != valid[j++]); + Textbuffer_write(scheme_buffer, chunk); } end_of_loop: + Textbuffer_reverse(scheme_buffer); scheme = Textbuffer_render(scheme_buffer); if (!scheme) { Textbuffer_dealloc(scheme_buffer); return -1; } - slashes = (Tokenizer_READ(self, 0) == '/' && - Tokenizer_READ(self, 1) == '/'); - if 
(!IS_SCHEME(scheme, slashes, 1)) { + slashes = (Tokenizer_read(self, 0) == '/' && + Tokenizer_read(self, 1) == '/'); + if (!IS_SCHEME(scheme, slashes)) { Py_DECREF(scheme); Textbuffer_dealloc(scheme_buffer); FAIL_ROUTE(0); @@ -1002,7 +534,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) Textbuffer_dealloc(scheme_buffer); return -1; } - if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1)) + if (Tokenizer_emit_textbuffer(self, scheme_buffer)) return -1; if (Tokenizer_emit_char(self, ':')) return -1; @@ -1017,29 +549,27 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) /* Handle text in a free external link, including trailing punctuation. */ -static int -Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, - Textbuffer** tail, Py_UNICODE this) +static int Tokenizer_handle_free_link_text( + Tokenizer* self, int* parens, Textbuffer* tail, Unicode this) { - #define PUSH_TAIL_BUFFER(tail, error) \ - if ((tail)->size || (tail)->next) { \ - if (Tokenizer_emit_textbuffer(self, tail, 0)) \ - return error; \ - tail = Textbuffer_new(); \ - if (!(tail)) \ - return error; \ + #define PUSH_TAIL_BUFFER(tail, error) \ + if (tail->length > 0) { \ + if (Textbuffer_concat(self->topstack->textbuffer, tail)) \ + return error; \ + if (Textbuffer_reset(tail)) \ + return error; \ } if (this == '(' && !(*parens)) { *parens = 1; - PUSH_TAIL_BUFFER(*tail, -1) + PUSH_TAIL_BUFFER(tail, -1) } else if (this == ',' || this == ';' || this == '\\' || this == '.' || this == ':' || this == '!' || this == '?' || (!(*parens) && this == ')')) return Textbuffer_write(tail, this); else - PUSH_TAIL_BUFFER(*tail, -1) + PUSH_TAIL_BUFFER(tail, -1) return Tokenizer_emit_char(self, this); } @@ -1047,10 +577,10 @@ Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, Return whether the current head is the end of a free link. */ static int -Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next) +Tokenizer_is_free_link(Tokenizer* self, Unicode this, Unicode next) { // Built from Tokenizer_parse()'s end sentinels: - Py_UNICODE after = Tokenizer_READ(self, 2); + Unicode after = Tokenizer_read(self, 2); uint64_t ctx = self->topstack->context; return (!this || this == '\n' || this == '[' || this == ']' || @@ -1066,9 +596,9 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next) */ static PyObject* Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, - Textbuffer** extra) + Textbuffer* extra) { - Py_UNICODE this, next; + Unicode this, next; int parens = 0; if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) : @@ -1076,23 +606,23 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, return NULL; if (BAD_ROUTE) return NULL; - this = Tokenizer_READ(self, 0); + this = Tokenizer_read(self, 0); if (!this || this == '\n' || this == ' ' || this == ']') return Tokenizer_fail_route(self); if (!brackets && this == '[') return Tokenizer_fail_route(self); while (1) { - this = Tokenizer_READ(self, 0); - next = Tokenizer_READ(self, 1); + this = Tokenizer_read(self, 0); + next = Tokenizer_read(self, 1); if (this == '&') { - PUSH_TAIL_BUFFER(*extra, NULL) + PUSH_TAIL_BUFFER(extra, NULL) if (Tokenizer_parse_entity(self)) return NULL; } else if (this == '<' && next == '!' 
- && Tokenizer_READ(self, 2) == '-' - && Tokenizer_READ(self, 3) == '-') { - PUSH_TAIL_BUFFER(*extra, NULL) + && Tokenizer_read(self, 2) == '-' + && Tokenizer_read(self, 3) == '-') { + PUSH_TAIL_BUFFER(extra, NULL) if (Tokenizer_parse_comment(self)) return NULL; } @@ -1103,7 +633,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, else if (!this || this == '\n') return Tokenizer_fail_route(self); else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) { - PUSH_TAIL_BUFFER(*extra, NULL) + PUSH_TAIL_BUFFER(extra, NULL) if (Tokenizer_parse_template_or_argument(self)) return NULL; } @@ -1143,7 +673,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"), *split, *scheme; Py_ssize_t length; - Textbuffer* temp; if (!text) return -1; @@ -1152,19 +681,9 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) if (!split) return -1; scheme = PyList_GET_ITEM(split, 0); - length = PyUnicode_GET_SIZE(scheme); - while (length) { - temp = self->topstack->textbuffer; - if (length <= temp->size) { - temp->size -= length; - break; - } - length -= temp->size; - self->topstack->textbuffer = temp->next; - free(temp->data); - free(temp); - } + length = PyUnicode_GET_LENGTH(scheme); Py_DECREF(split); + self->topstack->textbuffer->length -= length; return 0; } @@ -1177,20 +696,20 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) #define NOT_A_LINK \ if (!brackets && self->topstack->context & LC_DLTERM) \ return Tokenizer_handle_dl_term(self); \ - return Tokenizer_emit_char(self, Tokenizer_READ(self, 0)) + return Tokenizer_emit_char(self, Tokenizer_read(self, 0)) Py_ssize_t reset = self->head; PyObject *link, *kwargs; - Textbuffer *extra = 0; + Textbuffer *extra; if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { NOT_A_LINK; } - extra = Textbuffer_new(); + extra = Textbuffer_new(&self->text); if (!extra) return -1; self->head++; - link = Tokenizer_really_parse_external_link(self, brackets, &extra); + link = Tokenizer_really_parse_external_link(self, brackets, extra); if (BAD_ROUTE) { RESET_ROUTE(); self->head = reset; @@ -1230,8 +749,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) Textbuffer_dealloc(extra); return -1; } - if (extra->size || extra->next) - return Tokenizer_emit_textbuffer(self, extra, 0); + if (extra->length > 0) + return Tokenizer_emit_textbuffer(self, extra); Textbuffer_dealloc(extra); return 0; } @@ -1248,7 +767,7 @@ static int Tokenizer_parse_heading(Tokenizer* self) self->global |= GL_HEADING; self->head += 1; - while (Tokenizer_READ(self, 0) == '=') { + while (Tokenizer_read(self, 0) == '=') { best++; self->head++; } @@ -1264,7 +783,11 @@ static int Tokenizer_parse_heading(Tokenizer* self) self->global ^= GL_HEADING; return 0; } - level = NEW_INT_FUNC(heading->level); +#ifdef IS_PY3K + level = PyLong_FromSsize_t(heading->level); +#else + level = PyInt_FromSsize_t(heading->level); +#endif if (!level) { Py_DECREF(heading->title); free(heading); @@ -1319,7 +842,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self) self->head += 1; best = 1; - while (Tokenizer_READ(self, 0) == '=') { + while (Tokenizer_read(self, 0) == '=') { best++; self->head++; } @@ -1373,8 +896,8 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self) */ static int Tokenizer_really_parse_entity(Tokenizer* self) { - PyObject *kwargs, *textobj; - Py_UNICODE this; + PyObject *kwargs, 
*charobj, *textobj; + Unicode this; int numeric, hexadecimal, i, j, zeroes, test; char *valid, *text, *buffer, *def; @@ -1387,7 +910,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) if (Tokenizer_emit(self, HTMLEntityStart)) return -1; self->head++; - this = Tokenizer_READ(self, 0); + this = Tokenizer_read(self, 0); if (!this) { Tokenizer_fail_route(self); return 0; @@ -1397,7 +920,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) if (Tokenizer_emit(self, HTMLEntityNumeric)) return -1; self->head++; - this = Tokenizer_READ(self, 0); + this = Tokenizer_read(self, 0); if (!this) { Tokenizer_fail_route(self); return 0; @@ -1407,7 +930,12 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) kwargs = PyDict_New(); if (!kwargs) return -1; - PyDict_SetItemString(kwargs, "char", Tokenizer_read(self, 0)); + if (!(charobj = PyUnicode_FROM_SINGLE(this))) { + Py_DECREF(kwargs); + return -1; + } + PyDict_SetItemString(kwargs, "char", charobj); + Py_DECREF(charobj); if (Tokenizer_emit_kwargs(self, HTMLEntityHex, kwargs)) return -1; self->head++; @@ -1431,7 +959,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) i = 0; zeroes = 0; while (1) { - this = Tokenizer_READ(self, 0); + this = Tokenizer_read(self, 0); if (this == ';') { if (i == 0) FAIL_ROUTE_AND_EXIT() @@ -1544,21 +1072,21 @@ static int Tokenizer_parse_comment(Tokenizer* self) { Py_ssize_t reset = self->head + 3; PyObject *comment; - Py_UNICODE this; + Unicode this; self->head += 4; if (Tokenizer_push(self, 0)) return -1; while (1) { - this = Tokenizer_READ(self, 0); + this = Tokenizer_read(self, 0); if (!this) { comment = Tokenizer_pop(self); Py_XDECREF(comment); self->head = reset; return Tokenizer_emit_text(self, "