diff --git a/CHANGELOG b/CHANGELOG
index 5b5d794..d878d0d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -8,9 +8,17 @@ v0.4.1 (unreleased):
includes when denoting tags, but not comments.
- Fixed the behavior of preserve_spacing in Template.add() and keep_field in
Template.remove() on parameters with hidden keys.
+- Removed _ListProxy.detach(). SmartLists now use weak references and their
+ children are garbage-collected properly.
- Fixed parser bugs involving:
- templates with completely blank names;
- templates with newlines and comments.
+- Heavy refactoring and fixes to the C tokenizer, including:
+ - corrected a design flaw in text handling, allowing for substantial speed
+ improvements when parsing long strings of plain text;
+ - implemented new Python 3.3 PEP 393 Unicode APIs.
+- Fixed various bugs in SmartList, including one that was causing memory issues
+ on 64-bit builds of Python 2 on Windows.
- Fixed some bugs in the release scripts.
v0.4 (released May 23, 2015):
diff --git a/appveyor.yml b/appveyor.yml
new file mode 100644
index 0000000..ffefaee
--- /dev/null
+++ b/appveyor.yml
@@ -0,0 +1,64 @@
+# This config file is used by appveyor.com to build Windows release binaries
+
+version: 0.4.1.dev0-b{build}
+
+branches:
+ only:
+ - master
+
+skip_tags: true
+
+environment:
+ global:
+ # See: http://stackoverflow.com/a/13751649/163740
+ WRAPPER: "cmd /E:ON /V:ON /C .\\scripts\\win_wrapper.cmd"
+ PIP: "%WRAPPER% %PYTHON%\\Scripts\\pip.exe"
+ SETUPPY: "%WRAPPER% %PYTHON%\\python setup.py --with-extension"
+ PYPI_USERNAME: "earwigbot"
+ PYPI_PASSWORD:
+ secure: gOIcvPxSC2ujuhwOzwj3v8xjq3CCYd8keFWVnguLM+gcL0e02qshDHy7gwZZwj0+
+
+ matrix:
+ - PYTHON: "C:\\Python27"
+ PYTHON_VERSION: "2.7"
+ PYTHON_ARCH: "32"
+
+ - PYTHON: "C:\\Python27-x64"
+ PYTHON_VERSION: "2.7"
+ PYTHON_ARCH: "64"
+
+ - PYTHON: "C:\\Python33"
+ PYTHON_VERSION: "3.3"
+ PYTHON_ARCH: "32"
+
+ - PYTHON: "C:\\Python33-x64"
+ PYTHON_VERSION: "3.3"
+ PYTHON_ARCH: "64"
+
+ - PYTHON: "C:\\Python34"
+ PYTHON_VERSION: "3.4"
+ PYTHON_ARCH: "32"
+
+ - PYTHON: "C:\\Python34-x64"
+ PYTHON_VERSION: "3.4"
+ PYTHON_ARCH: "64"
+
+install:
+ - "%PIP% install wheel twine"
+
+build_script:
+ - "%SETUPPY% build"
+
+test_script:
+ - "%SETUPPY% -q test"
+
+after_test:
+ - "%SETUPPY% bdist_wheel"
+
+on_success:
+ - "twine upload dist\\* -u %PYPI_USERNAME% -p %PYPI_PASSWORD%"
+
+artifacts:
+ - path: dist\*
+
+deploy: off
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 4e64a8b..f64aba6 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -13,13 +13,24 @@ Unreleased
- Added support for Python 3.5.
- ``<`` and ``>`` are now disallowed in wikilink titles and template names.
This includes when denoting tags, but not comments.
-- Fixed the behavior of *preserve_spacing* in :func:`~.Template.add` and
- *keep_field* in :func:`~.Template.remove` on parameters with hidden keys.
+- Fixed the behavior of *preserve_spacing* in :meth:`.Template.add` and
+ *keep_field* in :meth:`.Template.remove` on parameters with hidden keys.
+- Removed :meth:`._ListProxy.detach`. :class:`.SmartList`\ s now use weak
+ references and their children are garbage-collected properly.
- Fixed parser bugs involving:
- templates with completely blank names;
- templates with newlines and comments.
+- Heavy refactoring and fixes to the C tokenizer, including:
+
+ - corrected a design flaw in text handling, allowing for substantial speed
+ improvements when parsing long strings of plain text;
+ - implemented new Python 3.3
+ `PEP 393 <https://www.python.org/dev/peps/pep-0393/>`_ Unicode APIs.
+
+- Fixed various bugs in :class:`.SmartList`, including one that was causing
+ memory issues on 64-bit builds of Python 2 on Windows.
- Fixed some bugs in the release scripts.
v0.4
diff --git a/mwparserfromhell/compat.py b/mwparserfromhell/compat.py
index 590a271..7a83cd1 100644
--- a/mwparserfromhell/compat.py
+++ b/mwparserfromhell/compat.py
@@ -18,14 +18,12 @@ if py3k:
bytes = bytes
str = str
range = range
- maxsize = sys.maxsize
import html.entities as htmlentities
else:
bytes = str
str = unicode
range = xrange
- maxsize = sys.maxint
import htmlentitydefs as htmlentities
del sys
diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py
index e0ba16b..cdacb3d 100644
--- a/mwparserfromhell/definitions.py
+++ b/mwparserfromhell/definitions.py
@@ -81,10 +81,8 @@ def is_single_only(tag):
"""Return whether or not the given *tag* must exist without a close tag."""
return tag.lower() in SINGLE_ONLY
-def is_scheme(scheme, slashes=True, reverse=False):
+def is_scheme(scheme, slashes=True):
"""Return whether *scheme* is valid for external links."""
- if reverse: # Convenience for C
- scheme = scheme[::-1]
scheme = scheme.lower()
if slashes:
return scheme in URI_SCHEMES
diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h
new file mode 100644
index 0000000..abade02
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/common.h
@@ -0,0 +1,125 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#pragma once
+
+#ifndef PY_SSIZE_T_CLEAN
+#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html
+#endif
+
+#include <Python.h>
+#include <math.h>
+#include <structmember.h>
+
+/* Compatibility macros */
+
+#if PY_MAJOR_VERSION >= 3
+#define IS_PY3K
+#endif
+
+#ifndef uint64_t
+#define uint64_t unsigned PY_LONG_LONG
+#endif
+
+#define malloc PyObject_Malloc // XXX: yuck
+#define realloc PyObject_Realloc
+#define free PyObject_Free
+
+/* Unicode support macros */
+
+#if defined(IS_PY3K) && PY_MINOR_VERSION >= 3
+#define PEP_393
+#endif
+
+#ifdef PEP_393
+#define Unicode Py_UCS4
+#define PyUnicode_FROM_SINGLE(chr) \
+ PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)
+#else
+#define Unicode Py_UNICODE
+#define PyUnicode_FROM_SINGLE(chr) \
+ PyUnicode_FromUnicode(&(chr), 1)
+#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE
+#endif
+
+/* Error handling macros */
+
+#define BAD_ROUTE self->route_state
+#define BAD_ROUTE_CONTEXT self->route_context
+#define FAIL_ROUTE(context) { \
+ self->route_state = 1; \
+ self->route_context = context; \
+ }
+#define RESET_ROUTE() self->route_state = 0
+
+/* Shared globals */
+
+extern char** entitydefs;
+
+extern PyObject* NOARGS;
+extern PyObject* definitions;
+
+/* Structs */
+
+typedef struct {
+ Py_ssize_t capacity;
+ Py_ssize_t length;
+#ifdef PEP_393
+ PyObject* object;
+ int kind;
+ void* data;
+#else
+ Py_UNICODE* data;
+#endif
+} Textbuffer;
+
+struct Stack {
+ PyObject* stack;
+ uint64_t context;
+ Textbuffer* textbuffer;
+ struct Stack* next;
+};
+typedef struct Stack Stack;
+
+typedef struct {
+ PyObject* object; /* base PyUnicodeObject object */
+ Py_ssize_t length; /* length of object, in code points */
+#ifdef PEP_393
+ int kind; /* object's kind value */
+ void* data; /* object's raw unicode buffer */
+#else
+ Py_UNICODE* buf; /* object's internal buffer */
+#endif
+} TokenizerInput;
+
+typedef struct {
+ PyObject_HEAD
+ TokenizerInput text; /* text to tokenize */
+ Stack* topstack; /* topmost stack */
+ Py_ssize_t head; /* current position in text */
+ int global; /* global context */
+ int depth; /* stack recursion depth */
+ int cycles; /* total number of stack recursions */
+ int route_state; /* whether a BadRoute has been triggered */
+ uint64_t route_context; /* context when the last BadRoute was triggered */
+ int skip_style_tags; /* temp fix for the sometimes broken tag parser */
+} Tokenizer;
diff --git a/mwparserfromhell/parser/ctokenizer/contexts.h b/mwparserfromhell/parser/ctokenizer/contexts.h
new file mode 100644
index 0000000..4e4a8c7
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/contexts.h
@@ -0,0 +1,105 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#pragma once
+
+/* Local contexts */
+
+#define LC_TEMPLATE 0x0000000000000007
+#define LC_TEMPLATE_NAME 0x0000000000000001
+#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002
+#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004
+
+#define LC_ARGUMENT 0x0000000000000018
+#define LC_ARGUMENT_NAME 0x0000000000000008
+#define LC_ARGUMENT_DEFAULT 0x0000000000000010
+
+#define LC_WIKILINK 0x0000000000000060
+#define LC_WIKILINK_TITLE 0x0000000000000020
+#define LC_WIKILINK_TEXT 0x0000000000000040
+
+#define LC_EXT_LINK 0x0000000000000180
+#define LC_EXT_LINK_URI 0x0000000000000080
+#define LC_EXT_LINK_TITLE 0x0000000000000100
+
+#define LC_HEADING 0x0000000000007E00
+#define LC_HEADING_LEVEL_1 0x0000000000000200
+#define LC_HEADING_LEVEL_2 0x0000000000000400
+#define LC_HEADING_LEVEL_3 0x0000000000000800
+#define LC_HEADING_LEVEL_4 0x0000000000001000
+#define LC_HEADING_LEVEL_5 0x0000000000002000
+#define LC_HEADING_LEVEL_6 0x0000000000004000
+
+#define LC_TAG 0x0000000000078000
+#define LC_TAG_OPEN 0x0000000000008000
+#define LC_TAG_ATTR 0x0000000000010000
+#define LC_TAG_BODY 0x0000000000020000
+#define LC_TAG_CLOSE 0x0000000000040000
+
+#define LC_STYLE 0x0000000000780000
+#define LC_STYLE_ITALICS 0x0000000000080000
+#define LC_STYLE_BOLD 0x0000000000100000
+#define LC_STYLE_PASS_AGAIN 0x0000000000200000
+#define LC_STYLE_SECOND_PASS 0x0000000000400000
+
+#define LC_DLTERM 0x0000000000800000
+
+#define LC_SAFETY_CHECK 0x000000007F000000
+#define LC_HAS_TEXT 0x0000000001000000
+#define LC_FAIL_ON_TEXT 0x0000000002000000
+#define LC_FAIL_NEXT 0x0000000004000000
+#define LC_FAIL_ON_LBRACE 0x0000000008000000
+#define LC_FAIL_ON_RBRACE 0x0000000010000000
+#define LC_FAIL_ON_EQUALS 0x0000000020000000
+#define LC_HAS_TEMPLATE 0x0000000040000000
+
+#define LC_TABLE 0x0000001F80000000
+#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000
+#define LC_TABLE_OPEN 0x0000000080000000
+#define LC_TABLE_CELL_OPEN 0x0000000100000000
+#define LC_TABLE_CELL_STYLE 0x0000000200000000
+#define LC_TABLE_ROW_OPEN 0x0000000400000000
+#define LC_TABLE_TD_LINE 0x0000000800000000
+#define LC_TABLE_TH_LINE 0x0000001000000000
+
+/* Global contexts */
+
+#define GL_HEADING 0x1
+
+/* Aggregate contexts */
+
+#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
+#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
+#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
+#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
+#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)
+
+/* Tag contexts */
+
+#define TAG_NAME 0x01
+#define TAG_ATTR_READY 0x02
+#define TAG_ATTR_NAME 0x04
+#define TAG_ATTR_VALUE 0x08
+#define TAG_QUOTED 0x10
+#define TAG_NOTE_SPACE 0x20
+#define TAG_NOTE_EQUALS 0x40
+#define TAG_NOTE_QUOTE 0x80
diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.c b/mwparserfromhell/parser/ctokenizer/tag_data.c
new file mode 100644
index 0000000..2f67966
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/tag_data.c
@@ -0,0 +1,78 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "tag_data.h"
+#include "contexts.h"
+
+/*
+ Initialize a new TagData object.
+*/
+TagData* TagData_new(TokenizerInput* text)
+{
+#define ALLOC_BUFFER(name) \
+ name = Textbuffer_new(text); \
+ if (!name) { \
+ TagData_dealloc(self); \
+ return NULL; \
+ }
+
+ TagData *self = malloc(sizeof(TagData));
+ if (!self) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ self->context = TAG_NAME;
+ ALLOC_BUFFER(self->pad_first)
+ ALLOC_BUFFER(self->pad_before_eq)
+ ALLOC_BUFFER(self->pad_after_eq)
+ self->quoter = 0;
+ self->reset = 0;
+ return self;
+
+#undef ALLOC_BUFFER
+}
+
+/*
+ Deallocate the given TagData object.
+*/
+void TagData_dealloc(TagData* self)
+{
+ if (self->pad_first)
+ Textbuffer_dealloc(self->pad_first);
+ if (self->pad_before_eq)
+ Textbuffer_dealloc(self->pad_before_eq);
+ if (self->pad_after_eq)
+ Textbuffer_dealloc(self->pad_after_eq);
+ free(self);
+}
+
+/*
+ Clear the internal buffers of the given TagData object.
+*/
+int TagData_reset_buffers(TagData* self)
+{
+ if (Textbuffer_reset(self->pad_first) ||
+ Textbuffer_reset(self->pad_before_eq) ||
+ Textbuffer_reset(self->pad_after_eq))
+ return -1;
+ return 0;
+}
diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.h b/mwparserfromhell/parser/ctokenizer/tag_data.h
new file mode 100644
index 0000000..f184081
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/tag_data.h
@@ -0,0 +1,43 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#pragma once
+
+#include "common.h"
+#include "textbuffer.h"
+
+/* Structs */
+
+typedef struct {
+ uint64_t context;
+ Textbuffer* pad_first;
+ Textbuffer* pad_before_eq;
+ Textbuffer* pad_after_eq;
+ Unicode quoter;
+ Py_ssize_t reset;
+} TagData;
+
+/* Functions */
+
+TagData* TagData_new(TokenizerInput*);
+void TagData_dealloc(TagData*);
+int TagData_reset_buffers(TagData*);
diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.c b/mwparserfromhell/parser/ctokenizer/textbuffer.c
new file mode 100644
index 0000000..0c711c5
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/textbuffer.c
@@ -0,0 +1,232 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "textbuffer.h"
+
+#define INITIAL_CAPACITY 32
+#define RESIZE_FACTOR 2
+#define CONCAT_EXTRA 32
+
+/*
+ Internal allocation function for textbuffers.
+*/
+static int internal_alloc(Textbuffer* self, Unicode maxchar)
+{
+ self->capacity = INITIAL_CAPACITY;
+ self->length = 0;
+
+#ifdef PEP_393
+ self->object = PyUnicode_New(self->capacity, maxchar);
+ if (!self->object)
+ return -1;
+ self->kind = PyUnicode_KIND(self->object);
+ self->data = PyUnicode_DATA(self->object);
+#else
+ (void) maxchar; // Unused
+ self->data = malloc(sizeof(Unicode) * self->capacity);
+ if (!self->data)
+ return -1;
+#endif
+
+ return 0;
+}
+
+/*
+ Internal deallocation function for textbuffers.
+*/
+static void internal_dealloc(Textbuffer* self)
+{
+#ifdef PEP_393
+ Py_DECREF(self->object);
+#else
+ free(self->data);
+#endif
+}
+
+/*
+ Internal resize function.
+*/
+static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
+{
+#ifdef PEP_393
+ PyObject *newobj;
+ void *newdata;
+
+ newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object));
+ if (!newobj)
+ return -1;
+ newdata = PyUnicode_DATA(newobj);
+ memcpy(newdata, self->data, self->length * self->kind);
+ Py_DECREF(self->object);
+ self->object = newobj;
+ self->data = newdata;
+#else
+ if (!(self->data = realloc(self->data, sizeof(Unicode) * new_cap)))
+ return -1;
+#endif
+
+ self->capacity = new_cap;
+ return 0;
+}
+
+/*
+ Create a new textbuffer object.
+*/
+Textbuffer* Textbuffer_new(TokenizerInput* text)
+{
+ Textbuffer* self = malloc(sizeof(Textbuffer));
+ Unicode maxchar = 0;
+
+#ifdef PEP_393
+ maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);
+#endif
+
+ if (!self)
+ goto fail_nomem;
+ if (internal_alloc(self, maxchar) < 0)
+ goto fail_dealloc;
+ return self;
+
+ fail_dealloc:
+ free(self);
+ fail_nomem:
+ PyErr_NoMemory();
+ return NULL;
+}
+
+/*
+ Deallocate the given textbuffer.
+*/
+void Textbuffer_dealloc(Textbuffer* self)
+{
+ internal_dealloc(self);
+ free(self);
+}
+
+/*
+ Reset a textbuffer to its initial, empty state.
+*/
+int Textbuffer_reset(Textbuffer* self)
+{
+ Unicode maxchar = 0;
+
+#ifdef PEP_393
+ maxchar = PyUnicode_MAX_CHAR_VALUE(self->object);
+#endif
+
+ internal_dealloc(self);
+ if (internal_alloc(self, maxchar))
+ return -1;
+ return 0;
+}
+
+/*
+ Write a Unicode codepoint to the given textbuffer.
+*/
+int Textbuffer_write(Textbuffer* self, Unicode code)
+{
+ if (self->length >= self->capacity) {
+ if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0)
+ return -1;
+ }
+
+#ifdef PEP_393
+ PyUnicode_WRITE(self->kind, self->data, self->length++, code);
+#else
+ self->data[self->length++] = code;
+#endif
+
+ return 0;
+}
+
+/*
+ Read a Unicode codepoint from the given index of the given textbuffer.
+
+ This function does not check for bounds.
+*/
+Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index)
+{
+#ifdef PEP_393
+ return PyUnicode_READ(self->kind, self->data, index);
+#else
+ return self->data[index];
+#endif
+}
+
+/*
+ Return the contents of the textbuffer as a Python Unicode object.
+*/
+PyObject* Textbuffer_render(Textbuffer* self)
+{
+#ifdef PEP_393
+ return PyUnicode_FromKindAndData(self->kind, self->data, self->length);
+#else
+ return PyUnicode_FromUnicode(self->data, self->length);
+#endif
+}
+
+/*
+ Concatenate the 'other' textbuffer onto the end of the given textbuffer.
+*/
+int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
+{
+ Py_ssize_t newlen = self->length + other->length;
+
+ if (newlen > self->capacity) {
+ if (internal_resize(self, newlen + CONCAT_EXTRA) < 0)
+ return -1;
+ }
+
+#ifdef PEP_393
+ assert(self->kind == other->kind);
+ memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data,
+ other->length * other->kind);
+#else
+ memcpy(self->data + self->length, other->data,
+ other->length * sizeof(Unicode));
+#endif
+
+ self->length = newlen;
+ return 0;
+}
+
+/*
+ Reverse the contents of the given textbuffer.
+*/
+void Textbuffer_reverse(Textbuffer* self)
+{
+ Py_ssize_t i, end = self->length - 1;
+ Unicode tmp;
+
+ for (i = 0; i < self->length / 2; i++) {
+#ifdef PEP_393
+ tmp = PyUnicode_READ(self->kind, self->data, i);
+ PyUnicode_WRITE(self->kind, self->data, i,
+ PyUnicode_READ(self->kind, self->data, end - i));
+ PyUnicode_WRITE(self->kind, self->data, end - i, tmp);
+#else
+ tmp = self->data[i];
+ self->data[i] = self->data[end - i];
+ self->data[end - i] = tmp;
+#endif
+ }
+}
diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.h b/mwparserfromhell/parser/ctokenizer/textbuffer.h
new file mode 100644
index 0000000..123d240
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h
@@ -0,0 +1,36 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#pragma once
+
+#include "common.h"
+
+/* Functions */
+
+Textbuffer* Textbuffer_new(TokenizerInput*);
+void Textbuffer_dealloc(Textbuffer*);
+int Textbuffer_reset(Textbuffer*);
+int Textbuffer_write(Textbuffer*, Unicode);
+Unicode Textbuffer_read(Textbuffer*, Py_ssize_t);
+PyObject* Textbuffer_render(Textbuffer*);
+int Textbuffer_concat(Textbuffer*, Textbuffer*);
+void Textbuffer_reverse(Textbuffer*);
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c
similarity index 73%
rename from mwparserfromhell/parser/tokenizer.c
rename to mwparserfromhell/parser/ctokenizer/tok_parse.c
index f4e801b..23cc246 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -1,5 +1,4 @@
/*
-Tokenizer for MWParserFromHell
Copyright (C) 2012-2015 Ben Kurtovic
Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ -21,12 +20,42 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
-#include "tokenizer.h"
+#include "tok_parse.h"
+#include "contexts.h"
+#include "tag_data.h"
+#include "tok_support.h"
+#include "tokens.h"
+
+#define DIGITS "0123456789"
+#define HEXDIGITS "0123456789abcdefABCDEF"
+#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
+#define MAX_BRACES 255
+#define MAX_ENTITY_SIZE 8
+
+#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li")
+#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL))
+#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL))
+#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL))
+#define IS_SCHEME(scheme, slashes) \
+ (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False))
+
+typedef struct {
+ PyObject* title;
+ int level;
+} HeadingData;
+
+/* Forward declarations */
+
+static int Tokenizer_parse_entity(Tokenizer*);
+static int Tokenizer_parse_comment(Tokenizer*);
+static int Tokenizer_handle_dl_term(Tokenizer*);
+static int Tokenizer_parse_tag(Tokenizer*);
/*
- Determine whether the given Py_UNICODE is a marker.
+ Determine whether the given code point is a marker.
*/
-static int is_marker(Py_UNICODE this)
+static int is_marker(Unicode this)
{
int i;
@@ -40,7 +69,7 @@ static int is_marker(Py_UNICODE this)
/*
Given a context, return the heading level encoded within it.
*/
-static int heading_level_from_context(int n)
+static int heading_level_from_context(uint64_t n)
{
int level;
@@ -51,14 +80,13 @@ static int heading_level_from_context(int n)
}
/*
- Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as
+ Call the given function in definitions.py, using 'in1' and 'in2' as
parameters, and return its output as a bool.
*/
-static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2,
- PyObject* in3)
+static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2)
{
PyObject* func = PyObject_GetAttrString(definitions, funcname);
- PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL);
+ PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL);
int ans = (result == Py_True) ? 1 : 0;
Py_DECREF(func);
@@ -89,496 +117,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
return lowered;
}
-static Textbuffer* Textbuffer_new(void)
-{
- Textbuffer* buffer = malloc(sizeof(Textbuffer));
-
- if (!buffer) {
- PyErr_NoMemory();
- return NULL;
- }
- buffer->size = 0;
- buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE);
- if (!buffer->data) {
- free(buffer);
- PyErr_NoMemory();
- return NULL;
- }
- buffer->prev = buffer->next = NULL;
- return buffer;
-}
-
-static void Textbuffer_dealloc(Textbuffer* self)
-{
- Textbuffer* next;
-
- while (self) {
- free(self->data);
- next = self->next;
- free(self);
- self = next;
- }
-}
-
-/*
- Write a Unicode codepoint to the given textbuffer.
-*/
-static int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
-{
- Textbuffer* self = *this;
-
- if (self->size == TEXTBUFFER_BLOCKSIZE) {
- Textbuffer* new = Textbuffer_new();
- if (!new)
- return -1;
- new->next = self;
- self->prev = new;
- *this = self = new;
- }
- self->data[self->size++] = code;
- return 0;
-}
-
-/*
- Return the contents of the textbuffer as a Python Unicode object.
-*/
-static PyObject* Textbuffer_render(Textbuffer* self)
-{
- PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
- PyObject *left, *concat;
-
- while (self->next) {
- self = self->next;
- left = PyUnicode_FromUnicode(self->data, self->size);
- concat = PyUnicode_Concat(left, result);
- Py_DECREF(left);
- Py_DECREF(result);
- result = concat;
- }
- return result;
-}
-
-static TagData* TagData_new(void)
-{
- TagData *self = malloc(sizeof(TagData));
-
- #define ALLOC_BUFFER(name) \
- name = Textbuffer_new(); \
- if (!name) { \
- TagData_dealloc(self); \
- return NULL; \
- }
-
- if (!self) {
- PyErr_NoMemory();
- return NULL;
- }
- self->context = TAG_NAME;
- ALLOC_BUFFER(self->pad_first)
- ALLOC_BUFFER(self->pad_before_eq)
- ALLOC_BUFFER(self->pad_after_eq)
- self->quoter = self->reset = 0;
- return self;
-}
-
-static void TagData_dealloc(TagData* self)
-{
- #define DEALLOC_BUFFER(name) \
- if (name) \
- Textbuffer_dealloc(name);
-
- DEALLOC_BUFFER(self->pad_first);
- DEALLOC_BUFFER(self->pad_before_eq);
- DEALLOC_BUFFER(self->pad_after_eq);
- free(self);
-}
-
-static int TagData_reset_buffers(TagData* self)
-{
- #define RESET_BUFFER(name) \
- Textbuffer_dealloc(name); \
- name = Textbuffer_new(); \
- if (!name) \
- return -1;
-
- RESET_BUFFER(self->pad_first)
- RESET_BUFFER(self->pad_before_eq)
- RESET_BUFFER(self->pad_after_eq)
- return 0;
-}
-
-static PyObject*
-Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
-{
- Tokenizer* self = (Tokenizer*) type->tp_alloc(type, 0);
- return (PyObject*) self;
-}
-
-static void Tokenizer_dealloc(Tokenizer* self)
-{
- Stack *this = self->topstack, *next;
- Py_XDECREF(self->text);
-
- while (this) {
- Py_DECREF(this->stack);
- Textbuffer_dealloc(this->textbuffer);
- next = this->next;
- free(this);
- this = next;
- }
- Py_TYPE(self)->tp_free((PyObject*) self);
-}
-
-static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
-{
- static char* kwlist[] = {NULL};
-
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
- return -1;
- self->text = Py_None;
- Py_INCREF(Py_None);
- self->topstack = NULL;
- self->head = self->length = self->global = self->depth = self->cycles = 0;
- return 0;
-}
-
-/*
- Add a new token stack, context, and textbuffer to the list.
-*/
-static int Tokenizer_push(Tokenizer* self, uint64_t context)
-{
- Stack* top = malloc(sizeof(Stack));
-
- if (!top) {
- PyErr_NoMemory();
- return -1;
- }
- top->stack = PyList_New(0);
- top->context = context;
- top->textbuffer = Textbuffer_new();
- if (!top->textbuffer)
- return -1;
- top->next = self->topstack;
- self->topstack = top;
- self->depth++;
- self->cycles++;
- return 0;
-}
-
-/*
- Push the textbuffer onto the stack as a Text node and clear it.
-*/
-static int Tokenizer_push_textbuffer(Tokenizer* self)
-{
- PyObject *text, *kwargs, *token;
- Textbuffer* buffer = self->topstack->textbuffer;
-
- if (buffer->size == 0 && !buffer->next)
- return 0;
- text = Textbuffer_render(buffer);
- if (!text)
- return -1;
- kwargs = PyDict_New();
- if (!kwargs) {
- Py_DECREF(text);
- return -1;
- }
- PyDict_SetItemString(kwargs, "text", text);
- Py_DECREF(text);
- token = PyObject_Call(Text, NOARGS, kwargs);
- Py_DECREF(kwargs);
- if (!token)
- return -1;
- if (PyList_Append(self->topstack->stack, token)) {
- Py_DECREF(token);
- return -1;
- }
- Py_DECREF(token);
- Textbuffer_dealloc(buffer);
- self->topstack->textbuffer = Textbuffer_new();
- if (!self->topstack->textbuffer)
- return -1;
- return 0;
-}
-
-/*
- Pop and deallocate the top token stack/context/textbuffer.
-*/
-static void Tokenizer_delete_top_of_stack(Tokenizer* self)
-{
- Stack* top = self->topstack;
-
- Py_DECREF(top->stack);
- Textbuffer_dealloc(top->textbuffer);
- self->topstack = top->next;
- free(top);
- self->depth--;
-}
-
-/*
- Pop the current stack/context/textbuffer, returing the stack.
-*/
-static PyObject* Tokenizer_pop(Tokenizer* self)
-{
- PyObject* stack;
-
- if (Tokenizer_push_textbuffer(self))
- return NULL;
- stack = self->topstack->stack;
- Py_INCREF(stack);
- Tokenizer_delete_top_of_stack(self);
- return stack;
-}
-
-/*
- Pop the current stack/context/textbuffer, returing the stack. We will also
- replace the underlying stack's context with the current stack's.
-*/
-static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
-{
- PyObject* stack;
- uint64_t context;
-
- if (Tokenizer_push_textbuffer(self))
- return NULL;
- stack = self->topstack->stack;
- Py_INCREF(stack);
- context = self->topstack->context;
- Tokenizer_delete_top_of_stack(self);
- self->topstack->context = context;
- return stack;
-}
-
-/*
- Fail the current tokenization route. Discards the current
- stack/context/textbuffer and sets the BAD_ROUTE flag.
-*/
-static void* Tokenizer_fail_route(Tokenizer* self)
-{
- uint64_t context = self->topstack->context;
- PyObject* stack = Tokenizer_pop(self);
-
- Py_XDECREF(stack);
- FAIL_ROUTE(context);
- return NULL;
-}
-
-/*
- Write a token to the current token stack.
-*/
-static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first)
-{
- PyObject* instance;
-
- if (Tokenizer_push_textbuffer(self))
- return -1;
- instance = PyObject_CallObject(token, NULL);
- if (!instance)
- return -1;
- if (first ? PyList_Insert(self->topstack->stack, 0, instance) :
- PyList_Append(self->topstack->stack, instance)) {
- Py_DECREF(instance);
- return -1;
- }
- Py_DECREF(instance);
- return 0;
-}
-
-/*
- Write a token to the current token stack, with kwargs. Steals a reference
- to kwargs.
-*/
-static int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
- PyObject* kwargs, int first)
-{
- PyObject* instance;
-
- if (Tokenizer_push_textbuffer(self)) {
- Py_DECREF(kwargs);
- return -1;
- }
- instance = PyObject_Call(token, NOARGS, kwargs);
- if (!instance) {
- Py_DECREF(kwargs);
- return -1;
- }
- if (first ? PyList_Insert(self->topstack->stack, 0, instance):
- PyList_Append(self->topstack->stack, instance)) {
- Py_DECREF(instance);
- Py_DECREF(kwargs);
- return -1;
- }
- Py_DECREF(instance);
- Py_DECREF(kwargs);
- return 0;
-}
-
-/*
- Write a Unicode codepoint to the current textbuffer.
-*/
-static int Tokenizer_emit_char(Tokenizer* self, Py_UNICODE code)
-{
- return Textbuffer_write(&(self->topstack->textbuffer), code);
-}
-
-/*
- Write a string of text to the current textbuffer.
-*/
-static int Tokenizer_emit_text(Tokenizer* self, const char* text)
-{
- int i = 0;
-
- while (text[i]) {
- if (Tokenizer_emit_char(self, text[i]))
- return -1;
- i++;
- }
- return 0;
-}
-
-/*
- Write the contents of another textbuffer to the current textbuffer,
- deallocating it in the process.
-*/
-static int
-Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse)
-{
- Textbuffer *original = buffer;
- long i;
-
- if (reverse) {
- do {
- for (i = buffer->size - 1; i >= 0; i--) {
- if (Tokenizer_emit_char(self, buffer->data[i])) {
- Textbuffer_dealloc(original);
- return -1;
- }
- }
- } while ((buffer = buffer->next));
- }
- else {
- while (buffer->next)
- buffer = buffer->next;
- do {
- for (i = 0; i < buffer->size; i++) {
- if (Tokenizer_emit_char(self, buffer->data[i])) {
- Textbuffer_dealloc(original);
- return -1;
- }
- }
- } while ((buffer = buffer->prev));
- }
- Textbuffer_dealloc(original);
- return 0;
-}
-
-/*
- Write a series of tokens to the current stack at once.
-*/
-static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
-{
- int pushed = 0;
- PyObject *stack, *token, *left, *right, *text;
- Textbuffer* buffer;
- Py_ssize_t size;
-
- if (PyList_GET_SIZE(tokenlist) > 0) {
- token = PyList_GET_ITEM(tokenlist, 0);
- switch (PyObject_IsInstance(token, Text)) {
- case 0:
- break;
- case 1: {
- pushed = 1;
- buffer = self->topstack->textbuffer;
- if (buffer->size == 0 && !buffer->next)
- break;
- left = Textbuffer_render(buffer);
- if (!left)
- return -1;
- right = PyObject_GetAttrString(token, "text");
- if (!right)
- return -1;
- text = PyUnicode_Concat(left, right);
- Py_DECREF(left);
- Py_DECREF(right);
- if (!text)
- return -1;
- if (PyObject_SetAttrString(token, "text", text)) {
- Py_DECREF(text);
- return -1;
- }
- Py_DECREF(text);
- Textbuffer_dealloc(buffer);
- self->topstack->textbuffer = Textbuffer_new();
- if (!self->topstack->textbuffer)
- return -1;
- break;
- }
- case -1:
- return -1;
- }
- }
- if (!pushed) {
- if (Tokenizer_push_textbuffer(self))
- return -1;
- }
- stack = self->topstack->stack;
- size = PyList_GET_SIZE(stack);
- if (PyList_SetSlice(stack, size, size, tokenlist))
- return -1;
- return 0;
-}
-
-/*
- Pop the current stack, write text, and then write the stack. 'text' is a
- NULL-terminated array of chars.
-*/
-static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
-{
- PyObject* stack = Tokenizer_pop(self);
-
- if (Tokenizer_emit_text(self, text)) {
- Py_DECREF(stack);
- return -1;
- }
- if (stack) {
- if (PyList_GET_SIZE(stack) > 0) {
- if (Tokenizer_emit_all(self, stack)) {
- Py_DECREF(stack);
- return -1;
- }
- }
- Py_DECREF(stack);
- }
- self->head--;
- return 0;
-}
-
-/*
- Read the value at a relative point in the wikicode, forwards.
-*/
-static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
-{
- Py_ssize_t index = self->head + delta;
-
- if (index >= self->length)
- return EMPTY;
- return PyList_GET_ITEM(self->text, index);
-}
-
-/*
- Read the value at a relative point in the wikicode, backwards.
-*/
-static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
-{
- Py_ssize_t index;
-
- if (delta > self->head)
- return EMPTY;
- index = self->head - delta;
- return PyList_GET_ITEM(self->text, index);
-}
-
/*
Parse a template at the head of the wikicode string.
*/
@@ -651,7 +189,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
PyObject *tokenlist;
self->head += 2;
- while (Tokenizer_READ(self, 0) == '{' && braces < MAX_BRACES) {
+ while (Tokenizer_read(self, 0) == '{' && braces < MAX_BRACES) {
self->head++;
braces++;
}
@@ -882,21 +420,21 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-";
Textbuffer* buffer;
PyObject* scheme;
- Py_UNICODE this;
+ Unicode this;
int slashes, i;
if (Tokenizer_push(self, LC_EXT_LINK_URI))
return -1;
- if (Tokenizer_READ(self, 0) == '/' && Tokenizer_READ(self, 1) == '/') {
+ if (Tokenizer_read(self, 0) == '/' && Tokenizer_read(self, 1) == '/') {
if (Tokenizer_emit_text(self, "//"))
return -1;
self->head += 2;
}
else {
- buffer = Textbuffer_new();
+ buffer = Textbuffer_new(&self->text);
if (!buffer)
return -1;
- while ((this = Tokenizer_READ(self, 0))) {
+ while ((this = Tokenizer_read(self, 0))) {
i = 0;
while (1) {
if (!valid[i])
@@ -905,7 +443,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
break;
i++;
}
- Textbuffer_write(&buffer, this);
+ Textbuffer_write(buffer, this);
if (Tokenizer_emit_char(self, this)) {
Textbuffer_dealloc(buffer);
return -1;
@@ -923,8 +461,8 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
return -1;
}
self->head++;
- slashes = (Tokenizer_READ(self, 0) == '/' &&
- Tokenizer_READ(self, 1) == '/');
+ slashes = (Tokenizer_read(self, 0) == '/' &&
+ Tokenizer_read(self, 1) == '/');
if (slashes) {
if (Tokenizer_emit_text(self, "//")) {
Textbuffer_dealloc(buffer);
@@ -936,7 +474,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
Textbuffer_dealloc(buffer);
if (!scheme)
return -1;
- if (!IS_SCHEME(scheme, slashes, 0)) {
+ if (!IS_SCHEME(scheme, slashes)) {
Py_DECREF(scheme);
Tokenizer_fail_route(self);
return 0;
@@ -952,46 +490,40 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
{
static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-";
- Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer;
+ Textbuffer *scheme_buffer = Textbuffer_new(&self->text);
PyObject *scheme;
- Py_UNICODE chunk;
- long i;
+ Unicode chunk;
+ Py_ssize_t i;
int slashes, j;
if (!scheme_buffer)
return -1;
// We have to backtrack through the textbuffer looking for our scheme since
// it was just parsed as text:
- temp_buffer = self->topstack->textbuffer;
- while (temp_buffer) {
- for (i = temp_buffer->size - 1; i >= 0; i--) {
- chunk = temp_buffer->data[i];
- if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
- goto end_of_loop;
- j = 0;
- while (1) {
- if (!valid[j]) {
- Textbuffer_dealloc(scheme_buffer);
- FAIL_ROUTE(0);
- return 0;
- }
- if (chunk == valid[j])
- break;
- j++;
+ for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) {
+ chunk = Textbuffer_read(self->topstack->textbuffer, i);
+ if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
+ goto end_of_loop;
+ j = 0;
+ do {
+ if (!valid[j]) {
+ Textbuffer_dealloc(scheme_buffer);
+ FAIL_ROUTE(0);
+ return 0;
}
- Textbuffer_write(&scheme_buffer, chunk);
- }
- temp_buffer = temp_buffer->next;
+ } while (chunk != valid[j++]);
+ Textbuffer_write(scheme_buffer, chunk);
}
end_of_loop:
+ Textbuffer_reverse(scheme_buffer);
scheme = Textbuffer_render(scheme_buffer);
if (!scheme) {
Textbuffer_dealloc(scheme_buffer);
return -1;
}
- slashes = (Tokenizer_READ(self, 0) == '/' &&
- Tokenizer_READ(self, 1) == '/');
- if (!IS_SCHEME(scheme, slashes, 1)) {
+ slashes = (Tokenizer_read(self, 0) == '/' &&
+ Tokenizer_read(self, 1) == '/');
+ if (!IS_SCHEME(scheme, slashes)) {
Py_DECREF(scheme);
Textbuffer_dealloc(scheme_buffer);
FAIL_ROUTE(0);
@@ -1002,7 +534,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
Textbuffer_dealloc(scheme_buffer);
return -1;
}
- if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1))
+ if (Tokenizer_emit_textbuffer(self, scheme_buffer))
return -1;
if (Tokenizer_emit_char(self, ':'))
return -1;
@@ -1017,29 +549,27 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
/*
Handle text in a free external link, including trailing punctuation.
*/
-static int
-Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
- Textbuffer** tail, Py_UNICODE this)
+static int Tokenizer_handle_free_link_text(
+ Tokenizer* self, int* parens, Textbuffer* tail, Unicode this)
{
- #define PUSH_TAIL_BUFFER(tail, error) \
- if ((tail)->size || (tail)->next) { \
- if (Tokenizer_emit_textbuffer(self, tail, 0)) \
- return error; \
- tail = Textbuffer_new(); \
- if (!(tail)) \
- return error; \
+ #define PUSH_TAIL_BUFFER(tail, error) \
+ if (tail->length > 0) { \
+ if (Textbuffer_concat(self->topstack->textbuffer, tail)) \
+ return error; \
+ if (Textbuffer_reset(tail)) \
+ return error; \
}
if (this == '(' && !(*parens)) {
*parens = 1;
- PUSH_TAIL_BUFFER(*tail, -1)
+ PUSH_TAIL_BUFFER(tail, -1)
}
else if (this == ',' || this == ';' || this == '\\' || this == '.' ||
this == ':' || this == '!' || this == '?' ||
(!(*parens) && this == ')'))
return Textbuffer_write(tail, this);
else
- PUSH_TAIL_BUFFER(*tail, -1)
+ PUSH_TAIL_BUFFER(tail, -1)
return Tokenizer_emit_char(self, this);
}
@@ -1047,10 +577,10 @@ Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
Return whether the current head is the end of a free link.
*/
static int
-Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
+Tokenizer_is_free_link(Tokenizer* self, Unicode this, Unicode next)
{
// Built from Tokenizer_parse()'s end sentinels:
- Py_UNICODE after = Tokenizer_READ(self, 2);
+ Unicode after = Tokenizer_read(self, 2);
uint64_t ctx = self->topstack->context;
return (!this || this == '\n' || this == '[' || this == ']' ||
@@ -1066,9 +596,9 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
*/
static PyObject*
Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
- Textbuffer** extra)
+ Textbuffer* extra)
{
- Py_UNICODE this, next;
+ Unicode this, next;
int parens = 0;
if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) :
@@ -1076,23 +606,23 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
return NULL;
if (BAD_ROUTE)
return NULL;
- this = Tokenizer_READ(self, 0);
+ this = Tokenizer_read(self, 0);
if (!this || this == '\n' || this == ' ' || this == ']')
return Tokenizer_fail_route(self);
if (!brackets && this == '[')
return Tokenizer_fail_route(self);
while (1) {
- this = Tokenizer_READ(self, 0);
- next = Tokenizer_READ(self, 1);
+ this = Tokenizer_read(self, 0);
+ next = Tokenizer_read(self, 1);
if (this == '&') {
- PUSH_TAIL_BUFFER(*extra, NULL)
+ PUSH_TAIL_BUFFER(extra, NULL)
if (Tokenizer_parse_entity(self))
return NULL;
}
else if (this == '<' && next == '!'
- && Tokenizer_READ(self, 2) == '-'
- && Tokenizer_READ(self, 3) == '-') {
- PUSH_TAIL_BUFFER(*extra, NULL)
+ && Tokenizer_read(self, 2) == '-'
+ && Tokenizer_read(self, 3) == '-') {
+ PUSH_TAIL_BUFFER(extra, NULL)
if (Tokenizer_parse_comment(self))
return NULL;
}
@@ -1103,7 +633,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
else if (!this || this == '\n')
return Tokenizer_fail_route(self);
else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) {
- PUSH_TAIL_BUFFER(*extra, NULL)
+ PUSH_TAIL_BUFFER(extra, NULL)
if (Tokenizer_parse_template_or_argument(self))
return NULL;
}
@@ -1143,7 +673,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"),
*split, *scheme;
Py_ssize_t length;
- Textbuffer* temp;
if (!text)
return -1;
@@ -1152,19 +681,9 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
if (!split)
return -1;
scheme = PyList_GET_ITEM(split, 0);
- length = PyUnicode_GET_SIZE(scheme);
- while (length) {
- temp = self->topstack->textbuffer;
- if (length <= temp->size) {
- temp->size -= length;
- break;
- }
- length -= temp->size;
- self->topstack->textbuffer = temp->next;
- free(temp->data);
- free(temp);
- }
+ length = PyUnicode_GET_LENGTH(scheme);
Py_DECREF(split);
+ self->topstack->textbuffer->length -= length;
return 0;
}
@@ -1177,20 +696,20 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
#define NOT_A_LINK \
if (!brackets && self->topstack->context & LC_DLTERM) \
return Tokenizer_handle_dl_term(self); \
- return Tokenizer_emit_char(self, Tokenizer_READ(self, 0))
+ return Tokenizer_emit_char(self, Tokenizer_read(self, 0))
Py_ssize_t reset = self->head;
PyObject *link, *kwargs;
- Textbuffer *extra = 0;
+ Textbuffer *extra;
if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
NOT_A_LINK;
}
- extra = Textbuffer_new();
+ extra = Textbuffer_new(&self->text);
if (!extra)
return -1;
self->head++;
- link = Tokenizer_really_parse_external_link(self, brackets, &extra);
+ link = Tokenizer_really_parse_external_link(self, brackets, extra);
if (BAD_ROUTE) {
RESET_ROUTE();
self->head = reset;
@@ -1230,8 +749,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
Textbuffer_dealloc(extra);
return -1;
}
- if (extra->size || extra->next)
- return Tokenizer_emit_textbuffer(self, extra, 0);
+ if (extra->length > 0)
+ return Tokenizer_emit_textbuffer(self, extra);
Textbuffer_dealloc(extra);
return 0;
}
@@ -1248,7 +767,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
self->global |= GL_HEADING;
self->head += 1;
- while (Tokenizer_READ(self, 0) == '=') {
+ while (Tokenizer_read(self, 0) == '=') {
best++;
self->head++;
}
@@ -1264,7 +783,11 @@ static int Tokenizer_parse_heading(Tokenizer* self)
self->global ^= GL_HEADING;
return 0;
}
- level = NEW_INT_FUNC(heading->level);
+#ifdef IS_PY3K
+ level = PyLong_FromSsize_t(heading->level);
+#else
+ level = PyInt_FromSsize_t(heading->level);
+#endif
if (!level) {
Py_DECREF(heading->title);
free(heading);
@@ -1319,7 +842,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
self->head += 1;
best = 1;
- while (Tokenizer_READ(self, 0) == '=') {
+ while (Tokenizer_read(self, 0) == '=') {
best++;
self->head++;
}
@@ -1373,8 +896,8 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
*/
static int Tokenizer_really_parse_entity(Tokenizer* self)
{
- PyObject *kwargs, *textobj;
- Py_UNICODE this;
+ PyObject *kwargs, *charobj, *textobj;
+ Unicode this;
int numeric, hexadecimal, i, j, zeroes, test;
char *valid, *text, *buffer, *def;
@@ -1387,7 +910,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
if (Tokenizer_emit(self, HTMLEntityStart))
return -1;
self->head++;
- this = Tokenizer_READ(self, 0);
+ this = Tokenizer_read(self, 0);
if (!this) {
Tokenizer_fail_route(self);
return 0;
@@ -1397,7 +920,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
if (Tokenizer_emit(self, HTMLEntityNumeric))
return -1;
self->head++;
- this = Tokenizer_READ(self, 0);
+ this = Tokenizer_read(self, 0);
if (!this) {
Tokenizer_fail_route(self);
return 0;
@@ -1407,7 +930,12 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
kwargs = PyDict_New();
if (!kwargs)
return -1;
- PyDict_SetItemString(kwargs, "char", Tokenizer_read(self, 0));
+ if (!(charobj = PyUnicode_FROM_SINGLE(this))) {
+ Py_DECREF(kwargs);
+ return -1;
+ }
+ PyDict_SetItemString(kwargs, "char", charobj);
+ Py_DECREF(charobj);
if (Tokenizer_emit_kwargs(self, HTMLEntityHex, kwargs))
return -1;
self->head++;
@@ -1431,7 +959,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
i = 0;
zeroes = 0;
while (1) {
- this = Tokenizer_READ(self, 0);
+ this = Tokenizer_read(self, 0);
if (this == ';') {
if (i == 0)
FAIL_ROUTE_AND_EXIT()
@@ -1544,21 +1072,21 @@ static int Tokenizer_parse_comment(Tokenizer* self)
{
Py_ssize_t reset = self->head + 3;
PyObject *comment;
- Py_UNICODE this;
+ Unicode this;
self->head += 4;
if (Tokenizer_push(self, 0))
return -1;
while (1) {
- this = Tokenizer_READ(self, 0);
+ this = Tokenizer_read(self, 0);
if (!this) {
comment = Tokenizer_pop(self);
Py_XDECREF(comment);
self->head = reset;
return Tokenizer_emit_text(self, "