From f0cacd5a9e56ec8ba195c26fcc198d517230eaaf Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 24 May 2015 00:11:32 -0400
Subject: [PATCH 01/40] Version bump to 0.4.1; fix broken sed commands in
 release.sh.

---
 CHANGELOG                    |  2 +-
 docs/changelog.rst           |  4 ++--
 mwparserfromhell/__init__.py |  2 +-
 scripts/release.sh           | 10 +++++-----
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 0ab103a..05e5423 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,4 @@
-v0.4 (unreleased):
+v0.4 (released May 23, 2015):
 
 - The parser now falls back on pure Python mode if C extensions cannot be
   built. This fixes an issue that prevented some Windows users from installing

diff --git a/docs/changelog.rst b/docs/changelog.rst
index 9811b5c..5607c59 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -4,8 +4,8 @@ Changelog
 v0.4
 ----
 
-Unreleased
-(`changes `__):
+`Released May 23, 2015 `_
+(`changes `__):
 
 - The parser now falls back on pure Python mode if C extensions cannot be
   built. This fixes an issue that prevented some Windows users from installing

diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py
index 94b6e03..a48a0e8 100644
--- a/mwparserfromhell/__init__.py
+++ b/mwparserfromhell/__init__.py
@@ -29,7 +29,7 @@ outrageously powerful parser for `MediaWiki `_ wikicode.
 __author__ = "Ben Kurtovic"
 __copyright__ = "Copyright (C) 2012, 2013, 2014, 2015 Ben Kurtovic"
 __license__ = "MIT License"
-__version__ = "0.4"
+__version__ = "0.4.1.dev0"
 __email__ = "ben.kurtovic@gmail.com"
 
 from . import (compat, definitions, nodes, parser, smart_list, string_mixin,

diff --git a/scripts/release.sh b/scripts/release.sh
index 4becf1a..dcd871c 100755
--- a/scripts/release.sh
+++ b/scripts/release.sh
@@ -34,7 +34,7 @@ update_version() {
 update_changelog() {
     filename="CHANGELOG"
     echo -n "Updating $filename..."
-    sed -e '1s/.*/v'$VERSION' (released '$RELEASE_DATE'):/' -i "" $filename
+    sed -e "1s/.*/v$VERSION (released $RELEASE_DATE):/" -i "" $filename
     echo " done."
 }
 
@@ -45,10 +45,10 @@ update_docs_changelog() {
     previous_lineno=$(expr $(grep -n -e "^---" $filename | sed '2q;d' | cut -d ':' -f 1) - 1)
     previous_version=$(sed $previous_lineno'q;d' $filename)
     sed \
-        -e '4s/.*/v'$VERSION \
-        -e '5s/.*/'$dashes \
-        -e '7s/.*/`Released '$RELEASE_DATE' `_/' \
-        -e '8s/.*/(`changes `__):/' \
+        -e "4s/.*/v$VERSION/" \
+        -e "5s/.*/$dashes/" \
+        -e "7s/.*/\`Released $RELEASE_DATE \`_/" \
+        -e "8s/.*/(\`changes \`__):/" \
         -i "" $filename
     echo " done."
 }

From 28faf72a84f4c98ef1e42d39591b7edd31419638 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 24 May 2015 00:20:13 -0400
Subject: [PATCH 02/40] Update the changelogs; sync README with docs on new
 Windows builds.

---
 CHANGELOG          |  7 +++++++
 README.rst         |  1 -
 docs/changelog.rst | 11 +++++++++++
 docs/index.rst     | 18 ++++++++----------
 4 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 05e5423..2a65157 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,10 @@
+v0.4.1 (unreleased):
+
+- The process for building Windows binaries has been fixed, and these should be
+  distributed along with new releases. Windows users can now take advantage of
+  C speedups without having a compiler of their own.
+- Fixed some bugs in the release scripts.
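
The sed breakage fixed in PATCH 01 above is a plain shell-quoting problem: in
the old commands, $VERSION and $RELEASE_DATE were spliced in outside the
single-quoted sed program, so a date containing spaces was word-split into
several arguments before sed ever saw the expression. The effect is easy to
reproduce with Python's shlex module, which mimics POSIX word splitting (a
standalone sketch; the version and date values are hypothetical stand-ins):

    import shlex

    version, date = "0.4.1", "May 23, 2015"  # hypothetical values
    # Old style: the variables sit outside the single quotes, so the shell
    # splits the expanded expression at the spaces inside the date.
    old = "sed -e '1s/.*/v'" + version + "' (released '" + date + "'):/' -i '' CHANGELOG"
    # New style: one double-quoted word survives expansion intact.
    new = 'sed -e "1s/.*/v' + version + ' (released ' + date + '):/" -i "" CHANGELOG'
    print(shlex.split(old))  # sed expression broken across three arguments
    print(shlex.split(new))  # ['sed', '-e', '1s/.*/v0.4.1 (released May 23, 2015):/', '-i', '', 'CHANGELOG']

The separate -i "" argument is BSD sed's in-place syntax (used on macOS),
which is why it stays outside the quoted expression in both versions.
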
+
 v0.4 (released May 23, 2015):
 
 - The parser now falls back on pure Python mode if C extensions cannot be

diff --git a/README.rst b/README.rst
index 45c7286..7e3e68d 100644
--- a/README.rst
+++ b/README.rst
@@ -156,7 +156,6 @@ If you're not using a library, you can parse any page using the following code
 .. _Legoktm: http://en.wikipedia.org/wiki/User:Legoktm
 .. _GitHub: https://github.com/earwig/mwparserfromhell
 .. _Python Package Index: http://pypi.python.org
-.. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat
 .. _get pip: http://pypi.python.org/pypi/pip
 .. _EarwigBot: https://github.com/earwig/earwigbot
 .. _Pywikibot: https://www.mediawiki.org/wiki/Manual:Pywikibot

diff --git a/docs/changelog.rst b/docs/changelog.rst
index 5607c59..9d49f60 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,6 +1,17 @@
 Changelog
 =========
 
+v0.4.1
+------
+
+Unreleased
+(`changes `__):
+
+- The process for building Windows binaries has been fixed, and these should be
+  distributed along with new releases. Windows users can now take advantage of
+  C speedups without having a compiler of their own.
+- Fixed some bugs in the release scripts.
+
 v0.4
 ----
 
diff --git a/docs/index.rst b/docs/index.rst
index 988f5e7..9a6c8ab 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -17,24 +17,22 @@ Development occurs on GitHub_.
 Installation
 ------------
 
-The easiest way to install the parser is through the `Python Package Index`_,
-so you can install the latest release with ``pip install mwparserfromhell``
-(`get pip`_). Alternatively, get the latest development version::
+The easiest way to install the parser is through the `Python Package Index`_;
+you can install the latest release with ``pip install mwparserfromhell``
+(`get pip`_). On Windows, make sure you have the latest version of pip
+installed by running ``pip install --upgrade pip``.
+
+Alternatively, get the latest development version::
 
     git clone https://github.com/earwig/mwparserfromhell.git
     cd mwparserfromhell
     python setup.py install
 
-If you get ``error: Unable to find vcvarsall.bat`` while installing, this is
-because Windows can't find the compiler for C extensions. Consult this
-`StackOverflow question`_ for help. You can also set ``ext_modules`` in
-``setup.py`` to an empty list to prevent the extension from building.
-
-You can run the comprehensive unit testing suite with ``python setup.py test``.
+You can run the comprehensive unit testing suite with
+``python setup.py test -q``.
 
 .. _Python Package Index: http://pypi.python.org
 .. _get pip: http://pypi.python.org/pypi/pip
-..
_StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat Contents -------- From 04188b590e6f46fd424b7d1d335fd252530dbde7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 25 May 2015 00:59:27 -0400 Subject: [PATCH 03/40] Start work on support for Python 3.5 (for #101) --- .travis.yml | 1 + setup.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 07dab97..c805433 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ python: - 3.2 - 3.3 - 3.4 + - 3.5 install: - pip install coveralls - python setup.py build diff --git a/setup.py b/setup.py index 5c21db5..e2744ef 100644 --- a/setup.py +++ b/setup.py @@ -36,14 +36,14 @@ from setuptools import setup, find_packages, Extension from mwparserfromhell import __version__ from mwparserfromhell.compat import py26, py3k -with open("README.rst", **{'encoding':'utf-8'} if py3k else {}) as fp: +with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp: long_docs = fp.read() tokenizer = Extension("mwparserfromhell.parser._tokenizer", sources=["mwparserfromhell/parser/tokenizer.c"], depends=["mwparserfromhell/parser/tokenizer.h"]) -use_extension=True +use_extension = True # Allow env var WITHOUT_EXTENSION and args --with[out]-extension if '--without-extension' in sys.argv: @@ -53,7 +53,7 @@ elif '--with-extension' in sys.argv: elif os.environ.get('WITHOUT_EXTENSION', '0') == '1': use_extension = False -# Remove the command line argument as it isnt understood by +# Remove the command line argument as it isn't understood by # setuptools/distutils sys.argv = [arg for arg in sys.argv if not arg.startswith('--with') @@ -126,6 +126,7 @@ optional_compile_setup( "Programming Language :: Python :: 3.2", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", "Topic :: Text Processing :: Markup" ], ) From ae2f28578aab58791f7dbde0dceee255816e4d6e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 25 May 2015 01:05:31 -0400 Subject: [PATCH 04/40] No 3.5 support in Travis, but it does have a nightly option (#101) --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c805433..c09e793 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,7 @@ python: - 3.2 - 3.3 - 3.4 - - 3.5 + - nightly install: - pip install coveralls - python setup.py build From 8fb5b3a6be052332f21e2343781bb32ffbfba8dd Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 25 May 2015 01:12:14 -0400 Subject: [PATCH 05/40] Python 3.5 works (closes #101) [ci skip] --- CHANGELOG | 1 + docs/changelog.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 2a65157..a886bcb 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,7 @@ v0.4.1 (unreleased): - The process for building Windows binaries has been fixed, and these should be distributed along with new releases. Windows users can now take advantage of C speedups without having a compiler of their own. +- Added support for Python 3.5. - Fixed some bugs in the release scripts. v0.4 (released May 23, 2015): diff --git a/docs/changelog.rst b/docs/changelog.rst index 9d49f60..e94e2f3 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -10,6 +10,7 @@ Unreleased - The process for building Windows binaries has been fixed, and these should be distributed along with new releases. Windows users can now take advantage of C speedups without having a compiler of their own. 
+- Added support for Python 3.5.
 - Fixed some bugs in the release scripts.
 
 v0.4
 
From 07d4577c330940bf30957829c24643000145a263 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 5 Jun 2015 00:23:53 -0400
Subject: [PATCH 06/40] Add tests for < and > in wikilink titles/template
 names (#104)

---
 tests/tokenizer/integration.mwtest | 70 ++++++++++++++++++++++++++++++++++++++
 tests/tokenizer/templates.mwtest   | 14 ++++++++
 tests/tokenizer/wikilinks.mwtest   | 28 +++++++++++++++
 3 files changed, 112 insertions(+)

diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest
index 1019175..27a7d39 100644
--- a/tests/tokenizer/integration.mwtest
+++ b/tests/tokenizer/integration.mwtest
@@ -241,3 +241,73 @@ name: newline_and_comment_in_template_name_7
 label: a template name containing a newline followed by a comment
 input: "{{foobar\nbarbaz]]"
+output: [Text(text="[[foo"), TagOpenOpen(), Text(text="i"), TagCloseOpen(padding=""), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="baz]]")]
+
+---
+
+name: tag_in_template_name
+label: HTML tags are invalid in template names, even when complete
+input: "{{foo<i>bar</i>baz}}"
+output: [Text(text="{{foo"), TagOpenOpen(), Text(text="i"), TagCloseOpen(padding=""), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="baz}}")]
+
+---
+
+name: tag_in_link_text
+label: HTML tags are valid in link text
+input: "[[foo|<i>bar</i>baz]]"
+output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), TagOpenOpen(), Text(text="i"), TagCloseOpen(padding=""), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="baz"), WikilinkClose()]
+
+---
+
+name: comment_in_link_title
+label: comments are valid in link titles
+input: "[[foo<!--bar-->baz]]"
+output: [WikilinkOpen(), Text(text="foo"), CommentStart(), Text(text="bar"), CommentEnd(), Text(text="baz"), WikilinkClose()]
+
+---
+
+name: incomplete_comment_in_link_title
+label: incomplete comments are invalid in link titles
+input: "[[foo\nfoobar\n}}"
+output: [TemplateOpen(), CommentStart(), Text(text=" comment "), CommentEnd(), Text(text="\nfoobar\n"), CommentStart(), Text(text=" comment "), CommentEnd(), TemplateClose()]
+
+---
+
+name: tag_in_link_title
+label: HTML tags are invalid in link titles, even when complete
+input: "[[foo<i>bar</i>baz]]"

From f16c7e25cac66bb3430cbf223a44844493cda1f3 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 24 Jul 2015 02:29:59 -0400
Subject: [PATCH 27/40] Fully fix parsing templates with blank names, I hope
 (#111)

---
 CHANGELOG                            |  3 +++
 docs/changelog.rst                   |  5 ++++
 mwparserfromhell/parser/contexts.py  | 18 +++++++------
 mwparserfromhell/parser/tokenizer.c  | 52 ++++++++++++++++++++++--------------
 mwparserfromhell/parser/tokenizer.h  | 21 ++++++++-------
 mwparserfromhell/parser/tokenizer.py | 35 +++++++++++++++---------
 tests/tokenizer/templates.mwtest     |  4 +--
 7 files changed, 85 insertions(+), 53 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index c696b98..5b5d794 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -8,6 +8,9 @@ v0.4.1 (unreleased):
   includes when denoting tags, but not comments.
 - Fixed the behavior of preserve_spacing in Template.add() and keep_field in
   Template.remove() on parameters with hidden keys.
+- Fixed parser bugs involving:
+  - templates with completely blank names;
+  - templates with newlines and comments.
 - Fixed some bugs in the release scripts.
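
The blank-name fix tracked by these changelog entries tightens what counts as
a well-formed template: the name must contain either text or a nested
template (the new HAS_TEMPLATE context bit introduced below) before a "|" or
"}}" is accepted. In terms of the library's public API, the intended behavior
looks roughly like this (a sketch assuming a build that includes this fix,
i.e. 0.4.1 or later):

    import mwparserfromhell

    # "{{|foo}}" has no name at all, so it should fall back to plain text;
    # a nested template alone is an acceptable name.
    for text in ("{{x|foo}}", "{{|foo}}", "{{ {{x}} |foo}}"):
        code = mwparserfromhell.parse(text)
        print(repr(text), "->", code.filter_templates(recursive=False))
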
v0.4 (released May 23, 2015): diff --git a/docs/changelog.rst b/docs/changelog.rst index 54f8af8..4e64a8b 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -15,6 +15,11 @@ Unreleased This includes when denoting tags, but not comments. - Fixed the behavior of *preserve_spacing* in :func:`~.Template.add` and *keep_field* in :func:`~.Template.remove` on parameters with hidden keys. +- Fixed parser bugs involving: + + - templates with completely blank names; + - templates with newlines and comments. + - Fixed some bugs in the release scripts. v0.4 diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index e98d8f7..b676e86 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -89,6 +89,7 @@ Local (stack-specific) contexts: * :const:`FAIL_ON_LBRACE` * :const:`FAIL_ON_RBRACE` * :const:`FAIL_ON_EQUALS` + * :const:`HAS_TEMPLATE` * :const:`TABLE` @@ -161,15 +162,16 @@ FAIL_NEXT = 1 << 26 FAIL_ON_LBRACE = 1 << 27 FAIL_ON_RBRACE = 1 << 28 FAIL_ON_EQUALS = 1 << 29 +HAS_TEMPLATE = 1 << 30 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + - FAIL_ON_RBRACE + FAIL_ON_EQUALS) - -TABLE_OPEN = 1 << 30 -TABLE_CELL_OPEN = 1 << 31 -TABLE_CELL_STYLE = 1 << 32 -TABLE_ROW_OPEN = 1 << 33 -TABLE_TD_LINE = 1 << 34 -TABLE_TH_LINE = 1 << 35 + FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE) + +TABLE_OPEN = 1 << 31 +TABLE_CELL_OPEN = 1 << 32 +TABLE_CELL_STYLE = 1 << 33 +TABLE_ROW_OPEN = 1 << 34 +TABLE_TD_LINE = 1 << 35 +TABLE_TH_LINE = 1 << 36 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + TABLE_TD_LINE + TABLE_TH_LINE) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index ec0315f..f4e801b 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -582,12 +582,16 @@ static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) /* Parse a template at the head of the wikicode string. 
*/ -static int Tokenizer_parse_template(Tokenizer* self) +static int Tokenizer_parse_template(Tokenizer* self, int has_content) { PyObject *template; Py_ssize_t reset = self->head; + uint64_t context = LC_TEMPLATE_NAME; - template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1); + if (has_content) + context |= LC_HAS_TEMPLATE; + + template = Tokenizer_parse(self, context, 1); if (BAD_ROUTE) { self->head = reset; return 0; @@ -643,6 +647,7 @@ static int Tokenizer_parse_argument(Tokenizer* self) static int Tokenizer_parse_template_or_argument(Tokenizer* self) { unsigned int braces = 2, i; + int has_content = 0; PyObject *tokenlist; self->head += 2; @@ -659,7 +664,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) return 0; } if (braces == 2) { - if (Tokenizer_parse_template(self)) + if (Tokenizer_parse_template(self, has_content)) return -1; if (BAD_ROUTE) { RESET_ROUTE(); @@ -673,7 +678,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) return -1; if (BAD_ROUTE) { RESET_ROUTE(); - if (Tokenizer_parse_template(self)) + if (Tokenizer_parse_template(self, has_content)) return -1; if (BAD_ROUTE) { char text[MAX_BRACES + 1]; @@ -689,8 +694,10 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) } else braces -= 3; - if (braces) + if (braces) { + has_content = 1; self->head++; + } } tokenlist = Tokenizer_pop(self); if (!tokenlist) @@ -712,8 +719,13 @@ static int Tokenizer_handle_template_param(Tokenizer* self) { PyObject *stack; - if (self->topstack->context & LC_TEMPLATE_NAME) + if (self->topstack->context & LC_TEMPLATE_NAME) { + if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) { + Tokenizer_fail_route(self); + return -1; + } self->topstack->context ^= LC_TEMPLATE_NAME; + } else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE; if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { @@ -764,7 +776,11 @@ static PyObject* Tokenizer_handle_template_end(Tokenizer* self) { PyObject* stack; - if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { + if (self->topstack->context & LC_TEMPLATE_NAME) { + if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) + return Tokenizer_fail_route(self); + } + else if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { stack = Tokenizer_pop_keeping_context(self); if (!stack) return NULL; @@ -2885,30 +2901,26 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) if (context & LC_TAG_CLOSE) return (data == '<') ? 
-1 : 0; if (context & LC_TEMPLATE_NAME) { - if (data == '{' || data == '}' || data == '[') { + if (data == '{') { + self->topstack->context |= LC_HAS_TEMPLATE | LC_FAIL_NEXT; + return 0; + } + if (data == '}' || (data == '<' && Tokenizer_READ(self, 1) == '!')) { self->topstack->context |= LC_FAIL_NEXT; return 0; } - if (data == ']' || data == '>' || (data == '<' && - Tokenizer_READ(self, 1) != '!')) { + if (data == '[' || data == ']' || data == '<' || data == '>') { return -1; } if (data == '|') return 0; if (context & LC_HAS_TEXT) { if (context & LC_FAIL_ON_TEXT) { - if (!Py_UNICODE_ISSPACE(data)) { - if (data == '<' && Tokenizer_READ(self, 1) == '!') { - self->topstack->context |= LC_FAIL_NEXT; - return 0; - } + if (!Py_UNICODE_ISSPACE(data)) return -1; - } - } - else { - if (data == '\n') - self->topstack->context |= LC_FAIL_ON_TEXT; } + else if (data == '\n') + self->topstack->context |= LC_FAIL_ON_TEXT; } else if (!Py_UNICODE_ISSPACE(data)) self->topstack->context |= LC_HAS_TEXT; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 842e65d..d477acb 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -150,22 +150,23 @@ static PyObject* TagCloseClose; #define LC_DLTERM 0x0000000000800000 -#define LC_SAFETY_CHECK 0x000000003F000000 +#define LC_SAFETY_CHECK 0x000000007F000000 #define LC_HAS_TEXT 0x0000000001000000 #define LC_FAIL_ON_TEXT 0x0000000002000000 #define LC_FAIL_NEXT 0x0000000004000000 #define LC_FAIL_ON_LBRACE 0x0000000008000000 #define LC_FAIL_ON_RBRACE 0x0000000010000000 #define LC_FAIL_ON_EQUALS 0x0000000020000000 - -#define LC_TABLE 0x0000000FC0000000 -#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 -#define LC_TABLE_OPEN 0x0000000040000000 -#define LC_TABLE_CELL_OPEN 0x0000000080000000 -#define LC_TABLE_CELL_STYLE 0x0000000100000000 -#define LC_TABLE_ROW_OPEN 0x0000000200000000 -#define LC_TABLE_TD_LINE 0x0000000400000000 -#define LC_TABLE_TH_LINE 0x0000000800000000 +#define LC_HAS_TEMPLATE 0x0000000040000000 + +#define LC_TABLE 0x0000001F80000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000 +#define LC_TABLE_OPEN 0x0000000080000000 +#define LC_TABLE_CELL_OPEN 0x0000000100000000 +#define LC_TABLE_CELL_STYLE 0x0000000200000000 +#define LC_TABLE_ROW_OPEN 0x0000000400000000 +#define LC_TABLE_TD_LINE 0x0000000800000000 +#define LC_TABLE_TH_LINE 0x0000001000000000 /* Global contexts: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 4d7d885..5c89455 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -192,11 +192,14 @@ class Tokenizer(object): self._fail_route() return self.END - def _parse_template(self): + def _parse_template(self, has_content): """Parse a template at the head of the wikicode string.""" reset = self._head + context = contexts.TEMPLATE_NAME + if has_content: + context |= contexts.HAS_TEMPLATE try: - template = self._parse(contexts.TEMPLATE_NAME) + template = self._parse(context) except BadRoute: self._head = reset raise @@ -223,6 +226,7 @@ class Tokenizer(object): while self._read() == "{": self._head += 1 braces += 1 + has_content = False self._push() while braces: @@ -230,7 +234,7 @@ class Tokenizer(object): return self._emit_text_then_stack("{") if braces == 2: try: - self._parse_template() + self._parse_template(has_content) except BadRoute: return self._emit_text_then_stack("{{") break @@ -239,11 +243,12 @@ class Tokenizer(object): braces -= 3 except BadRoute: try: - 
self._parse_template() + self._parse_template(has_content) braces -= 2 except BadRoute: return self._emit_text_then_stack("{" * braces) if braces: + has_content = True self._head += 1 self._emit_all(self._pop()) @@ -253,6 +258,8 @@ class Tokenizer(object): def _handle_template_param(self): """Handle a template parameter at the head of the string.""" if self._context & contexts.TEMPLATE_NAME: + if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE): + self._fail_route() self._context ^= contexts.TEMPLATE_NAME elif self._context & contexts.TEMPLATE_PARAM_VALUE: self._context ^= contexts.TEMPLATE_PARAM_VALUE @@ -271,7 +278,10 @@ class Tokenizer(object): def _handle_template_end(self): """Handle the end of a template at the head of the string.""" - if self._context & contexts.TEMPLATE_PARAM_KEY: + if self._context & contexts.TEMPLATE_NAME: + if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE): + self._fail_route() + elif self._context & contexts.TEMPLATE_PARAM_KEY: self._emit_all(self._pop(keep_context=True)) self._head += 1 return self._pop() @@ -1183,23 +1193,22 @@ class Tokenizer(object): elif context & contexts.EXT_LINK_TITLE: return this != "\n" elif context & contexts.TEMPLATE_NAME: - if this == "{" or this == "}" or this == "[": + if this == "{": + self._context |= contexts.HAS_TEMPLATE | contexts.FAIL_NEXT + return True + if this == "}" or (this == "<" and self._read(1) == "!"): self._context |= contexts.FAIL_NEXT return True - if this == "]" or this == ">" or (this == "<" and self._read(1) != "!"): + if this == "[" or this == "]" or this == "<" or this == ">": return False if this == "|": return True if context & contexts.HAS_TEXT: if context & contexts.FAIL_ON_TEXT: if this is self.END or not this.isspace(): - if this == "<" and self._read(1) == "!": - self._context |= contexts.FAIL_NEXT - return True return False - else: - if this == "\n": - self._context |= contexts.FAIL_ON_TEXT + elif this == "\n": + self._context |= contexts.FAIL_ON_TEXT elif this is self.END or not this.isspace(): self._context |= contexts.HAS_TEXT return True diff --git a/tests/tokenizer/templates.mwtest b/tests/tokenizer/templates.mwtest index 4756ac6..1913f5d 100644 --- a/tests/tokenizer/templates.mwtest +++ b/tests/tokenizer/templates.mwtest @@ -686,5 +686,5 @@ output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ name: recursion_opens_and_closes label: test potentially dangerous recursion: template openings and closings -input: "{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}" -output: [Text(text="{{|"), TemplateOpen(), TemplateClose(), Text(text="{{|"), TemplateOpen(), TemplateClose(), TemplateOpen(), TemplateParamSeparator(), TemplateOpen(), TemplateClose(), Text(text="{{"), TemplateParamSeparator(), Text(text="{{"), TemplateClose(), Text(text="{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}")] +input: "{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}" +output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), TemplateOpen(), Text(text="x"), TemplateParamSeparator(), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x"), TemplateParamSeparator(), Text(text="{{x"), TemplateClose(), Text(text="{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}")] From 
2a3a978986165431a9c192ad7ff64da897f93a6e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Jul 2015 22:55:06 -0400 Subject: [PATCH 28/40] Incomplete code for C tokenizer textbuffer. --- mwparserfromhell/parser/ctokenizer/common.h | 21 +++++++++++++++------ mwparserfromhell/parser/ctokenizer/tok_parse.c | 17 +++++------------ mwparserfromhell/parser/ctokenizer/tok_parse.h | 6 ++++++ tests/tokenizer/text.mwtest | 2 +- 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h index 555cbf9..92a41ca 100644 --- a/mwparserfromhell/parser/ctokenizer/common.h +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -43,6 +43,18 @@ SOFTWARE. #define malloc PyObject_Malloc // XXX: yuck #define free PyObject_Free +/* Unicode support macros */ + +#if defined(IS_PY3K) && PYTHON_MINOR_VERSION >= 3 +#define PEP_393 +#endif + +#ifdef PEP_393 +#define Unicode Py_UCS4 +#else +#define Unicode Py_UNICODE +#endif + /* Error handling macros */ #define BAD_ROUTE self->route_state @@ -63,18 +75,15 @@ extern PyObject* definitions; /* Structs */ -struct Textbuffer { +typedef struct { Py_ssize_t size; Py_UNICODE* data; - struct Textbuffer* prev; - struct Textbuffer* next; -}; -typedef struct Textbuffer Textbuffer; +} Textbuffer; struct Stack { PyObject* stack; uint64_t context; - struct Textbuffer* textbuffer; + Textbuffer* textbuffer; struct Stack* next; }; typedef struct Stack Stack; diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index 0aff311..81d4bce 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -30,11 +30,6 @@ SOFTWARE. #define HEXDIGITS "0123456789abcdefABCDEF" #define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" -static const char MARKERS[] = { - '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', - '-', '!', '\n', '\0'}; - -#define NUM_MARKERS 19 #define MAX_BRACES 255 #define MAX_ENTITY_SIZE 8 @@ -45,12 +40,6 @@ static const char MARKERS[] = { #define IS_SCHEME(scheme, slashes, reverse) \ (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False)) -#ifdef IS_PY3K - #define NEW_INT_FUNC PyLong_FromSsize_t -#else - #define NEW_INT_FUNC PyInt_FromSsize_t -#endif - typedef struct { PyObject* title; int level; @@ -798,7 +787,11 @@ static int Tokenizer_parse_heading(Tokenizer* self) self->global ^= GL_HEADING; return 0; } - level = NEW_INT_FUNC(heading->level); +#ifdef IS_PY3K + level = PyLong_FromSsize_t(heading->level); +#else + level = PyInt_FromSsize_t(heading->level); +#endif if (!level) { Py_DECREF(heading->title); free(heading); diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.h b/mwparserfromhell/parser/ctokenizer/tok_parse.h index 79e4acf..0899a34 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.h +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.h @@ -24,6 +24,12 @@ SOFTWARE. 
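
The Textbuffer change in this patch drops the old doubly-linked list of
fixed-size text blocks in favor of a single size/data pair. The pure-Python
tokenizer has long used the list-based version of the same idea: append
characters cheaply, then join them once when the buffer is flushed. A minimal
sketch of that pattern (illustrative names, not the C API):

    class Textbuffer:
        """Accumulate single characters; combine them only on demand."""

        def __init__(self):
            self._chunks = []

        def write(self, ch):
            self._chunks.append(ch)  # amortized O(1) per character

        def render(self):
            return "".join(self._chunks)  # one O(n) join instead of O(n^2) +=

    buf = Textbuffer()
    for ch in "{{foo}}":
        buf.write(ch)
    assert buf.render() == "{{foo}}"
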
#include "common.h" +static const char MARKERS[] = { + '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', + '-', '!', '\n', '\0'}; + +#define NUM_MARKERS 19 + /* Functions */ PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int); diff --git a/tests/tokenizer/text.mwtest b/tests/tokenizer/text.mwtest index 040c677..95bea6f 100644 --- a/tests/tokenizer/text.mwtest +++ b/tests/tokenizer/text.mwtest @@ -27,6 +27,6 @@ output: [Text(text="𐌲𐌿𐍄𐌰𐍂𐌰𐌶𐌳𐌰")] --- name: large -label: a lot of text, requiring multiple textbuffer blocks in the C tokenizer +label: a lot of text, requiring proper storage in the C tokenizer input: "ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN" output: 
[Text(text="ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN")] From 2072a10b67ca7da7bd21c5306108be28eaae57c1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 27 Jul 2015 03:57:16 -0400 Subject: [PATCH 29/40] More reworking of CTokenizer Unicode support (incomplete) --- mwparserfromhell/parser/ctokenizer/common.h | 20 +++- mwparserfromhell/parser/ctokenizer/tok_parse.c | 120 ++++++++++++----------- mwparserfromhell/parser/ctokenizer/tok_support.c | 24 +++-- mwparserfromhell/parser/ctokenizer/tok_support.h | 10 +- mwparserfromhell/parser/ctokenizer/tokenizer.c | 73 +++++++++++--- 5 files changed, 160 insertions(+), 87 deletions(-) diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h index 92a41ca..55d3906 100644 --- a/mwparserfromhell/parser/ctokenizer/common.h +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -51,8 +51,12 @@ SOFTWARE. 
#ifdef PEP_393 #define Unicode Py_UCS4 +#define PyUnicode_FROM_SINGLE(chr) \ + PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1) #else #define Unicode Py_UNICODE +#define PyUnicode_FROM_SINGLE(chr) \ + PyUnicode_FromUnicode(&(chr), 1) #endif /* Error handling macros */ @@ -77,7 +81,7 @@ extern PyObject* definitions; typedef struct { Py_ssize_t size; - Py_UNICODE* data; + Unicode* data; } Textbuffer; struct Stack { @@ -89,11 +93,21 @@ struct Stack { typedef struct Stack Stack; typedef struct { + PyObject* object; /* base PyUnicodeObject object */ + Py_ssize_t length; /* length of object, in code points */ +#ifdef PEP_393 + int kind; /* object's kind value */ + void* data; /* object's raw unicode buffer */ +#else + Py_UNICODE* buf; /* object's internal buffer */ +#endif +} TokenizerInput; + +typedef struct { PyObject_HEAD - PyObject* text; /* text to tokenize */ + TokenizerInput text; /* text to tokenize */ Stack* topstack; /* topmost stack */ Py_ssize_t head; /* current position in text */ - Py_ssize_t length; /* length of text */ int global; /* global context */ int depth; /* stack recursion depth */ int cycles; /* total number of stack recursions */ diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index d761e27..712e248 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -190,7 +190,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) PyObject *tokenlist; self->head += 2; - while (Tokenizer_READ(self, 0) == '{' && braces < MAX_BRACES) { + while (Tokenizer_read(self, 0) == '{' && braces < MAX_BRACES) { self->head++; braces++; } @@ -426,7 +426,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) if (Tokenizer_push(self, LC_EXT_LINK_URI)) return -1; - if (Tokenizer_READ(self, 0) == '/' && Tokenizer_READ(self, 1) == '/') { + if (Tokenizer_read(self, 0) == '/' && Tokenizer_read(self, 1) == '/') { if (Tokenizer_emit_text(self, "//")) return -1; self->head += 2; @@ -435,7 +435,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) buffer = Textbuffer_new(); if (!buffer) return -1; - while ((this = Tokenizer_READ(self, 0))) { + while ((this = Tokenizer_read(self, 0))) { i = 0; while (1) { if (!valid[i]) @@ -462,8 +462,8 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) return -1; } self->head++; - slashes = (Tokenizer_READ(self, 0) == '/' && - Tokenizer_READ(self, 1) == '/'); + slashes = (Tokenizer_read(self, 0) == '/' && + Tokenizer_read(self, 1) == '/'); if (slashes) { if (Tokenizer_emit_text(self, "//")) { Textbuffer_dealloc(buffer); @@ -528,8 +528,8 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) Textbuffer_dealloc(scheme_buffer); return -1; } - slashes = (Tokenizer_READ(self, 0) == '/' && - Tokenizer_READ(self, 1) == '/'); + slashes = (Tokenizer_read(self, 0) == '/' && + Tokenizer_read(self, 1) == '/'); if (!IS_SCHEME(scheme, slashes, 1)) { Py_DECREF(scheme); Textbuffer_dealloc(scheme_buffer); @@ -589,7 +589,7 @@ static int Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next) { // Built from Tokenizer_parse()'s end sentinels: - Py_UNICODE after = Tokenizer_READ(self, 2); + Py_UNICODE after = Tokenizer_read(self, 2); uint64_t ctx = self->topstack->context; return (!this || this == '\n' || this == '[' || this == ']' || @@ -615,22 +615,22 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, return NULL; if (BAD_ROUTE) return NULL; - this = 
Tokenizer_READ(self, 0); + this = Tokenizer_read(self, 0); if (!this || this == '\n' || this == ' ' || this == ']') return Tokenizer_fail_route(self); if (!brackets && this == '[') return Tokenizer_fail_route(self); while (1) { - this = Tokenizer_READ(self, 0); - next = Tokenizer_READ(self, 1); + this = Tokenizer_read(self, 0); + next = Tokenizer_read(self, 1); if (this == '&') { PUSH_TAIL_BUFFER(*extra, NULL) if (Tokenizer_parse_entity(self)) return NULL; } else if (this == '<' && next == '!' - && Tokenizer_READ(self, 2) == '-' - && Tokenizer_READ(self, 3) == '-') { + && Tokenizer_read(self, 2) == '-' + && Tokenizer_read(self, 3) == '-') { PUSH_TAIL_BUFFER(*extra, NULL) if (Tokenizer_parse_comment(self)) return NULL; @@ -716,7 +716,7 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) #define NOT_A_LINK \ if (!brackets && self->topstack->context & LC_DLTERM) \ return Tokenizer_handle_dl_term(self); \ - return Tokenizer_emit_char(self, Tokenizer_READ(self, 0)) + return Tokenizer_emit_char(self, Tokenizer_read(self, 0)) Py_ssize_t reset = self->head; PyObject *link, *kwargs; @@ -787,7 +787,7 @@ static int Tokenizer_parse_heading(Tokenizer* self) self->global |= GL_HEADING; self->head += 1; - while (Tokenizer_READ(self, 0) == '=') { + while (Tokenizer_read(self, 0) == '=') { best++; self->head++; } @@ -862,7 +862,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self) self->head += 1; best = 1; - while (Tokenizer_READ(self, 0) == '=') { + while (Tokenizer_read(self, 0) == '=') { best++; self->head++; } @@ -916,7 +916,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self) */ static int Tokenizer_really_parse_entity(Tokenizer* self) { - PyObject *kwargs, *textobj; + PyObject *kwargs, *charobj, *textobj; Py_UNICODE this; int numeric, hexadecimal, i, j, zeroes, test; char *valid, *text, *buffer, *def; @@ -930,7 +930,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) if (Tokenizer_emit(self, HTMLEntityStart)) return -1; self->head++; - this = Tokenizer_READ(self, 0); + this = Tokenizer_read(self, 0); if (!this) { Tokenizer_fail_route(self); return 0; @@ -940,7 +940,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) if (Tokenizer_emit(self, HTMLEntityNumeric)) return -1; self->head++; - this = Tokenizer_READ(self, 0); + this = Tokenizer_read(self, 0); if (!this) { Tokenizer_fail_route(self); return 0; @@ -950,7 +950,12 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) kwargs = PyDict_New(); if (!kwargs) return -1; - PyDict_SetItemString(kwargs, "char", Tokenizer_read(self, 0)); + if (!(charobj = PyUnicode_FROM_SINGLE(this))) { + Py_DECREF(kwargs); + return -1; + } + PyDict_SetItemString(kwargs, "char", charobj); + Py_DECREF(charobj); if (Tokenizer_emit_kwargs(self, HTMLEntityHex, kwargs)) return -1; self->head++; @@ -974,7 +979,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) i = 0; zeroes = 0; while (1) { - this = Tokenizer_READ(self, 0); + this = Tokenizer_read(self, 0); if (this == ';') { if (i == 0) FAIL_ROUTE_AND_EXIT() @@ -1093,15 +1098,15 @@ static int Tokenizer_parse_comment(Tokenizer* self) if (Tokenizer_push(self, 0)) return -1; while (1) { - this = Tokenizer_READ(self, 0); + this = Tokenizer_read(self, 0); if (!this) { comment = Tokenizer_pop(self); Py_XDECREF(comment); self->head = reset; return Tokenizer_emit_text(self, "
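
The series cuts off above inside Tokenizer_parse_comment(), which shows the
tokenizer's standard recovery idiom: record the current head, attempt to
parse the construct, and if the closing marker never arrives, rewind and emit
the opening text as literal characters. From the library's public API, the
observable effect is that an unterminated comment parses as plain text (a
sketch; exact node reprs may differ slightly between versions):

    import mwparserfromhell

    complete = mwparserfromhell.parse("a <!-- note --> b")
    dangling = mwparserfromhell.parse("a <!-- note b")
    print(complete.filter_comments())  # one Comment node: <!-- note -->
    print(dangling.filter_comments())  # [] -- the "<!--" stayed literal text
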