diff --git a/.travis.yml b/.travis.yml index 07dab97..7095d21 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,8 @@ python: - 3.2 - 3.3 - 3.4 + - 3.5-dev +sudo: false install: - pip install coveralls - python setup.py build diff --git a/CHANGELOG b/CHANGELOG index 0ab103a..921bc4f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,27 @@ -v0.4 (unreleased): +v0.4.1 (released July 30, 2015): + +- The process for building Windows binaries has been fixed, and these should be + distributed along with new releases. Windows users can now take advantage of + C speedups without having a compiler of their own. +- Added support for Python 3.5. +- '<' and '>' are now disallowed in wikilink titles and template names. This + includes when denoting tags, but not comments. +- Fixed the behavior of preserve_spacing in Template.add() and keep_field in + Template.remove() on parameters with hidden keys. +- Removed _ListProxy.detach(). SmartLists now use weak references and their + children are garbage-collected properly. +- Fixed parser bugs involving: + - templates with completely blank names; + - templates with newlines and comments. +- Heavy refactoring and fixes to the C tokenizer, including: + - corrected a design flaw in text handling, allowing for substantial speed + improvements when parsing long strings of plain text; + - implemented new Python 3.3 PEP 393 Unicode APIs. +- Fixed various bugs in SmartList, including one that was causing memory issues + on 64-bit builds of Python 2 on Windows. +- Fixed some bugs in the release scripts. + +v0.4 (released May 23, 2015): - The parser now falls back on pure Python mode if C extensions cannot be built. This fixes an issue that prevented some Windows users from installing diff --git a/README.rst b/README.rst index 45c7286..c361a56 100644 --- a/README.rst +++ b/README.rst @@ -139,7 +139,7 @@ If you're not using a library, you can parse any page using the following code from urllib.parse import urlencode from urllib.request import urlopen import mwparserfromhell - API_URL = "http://en.wikipedia.org/w/api.php" + API_URL = "https://en.wikipedia.org/w/api.php" def parse(title): data = {"action": "query", "prop": "revisions", "rvlimit": 1, @@ -156,7 +156,6 @@ If you're not using a library, you can parse any page using the following code .. _Legoktm: http://en.wikipedia.org/wiki/User:Legoktm .. _GitHub: https://github.com/earwig/mwparserfromhell .. _Python Package Index: http://pypi.python.org -.. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat .. _get pip: http://pypi.python.org/pypi/pip .. _EarwigBot: https://github.com/earwig/earwigbot .. 
_Pywikibot: https://www.mediawiki.org/wiki/Manual:Pywikibot diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000..4ed112a --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,64 @@ +# This config file is used by appveyor.com to build Windows release binaries + +version: 0.4.1-b{build} + +branches: + only: + - master + +skip_tags: true + +environment: + global: + # See: http://stackoverflow.com/a/13751649/163740 + WRAPPER: "cmd /E:ON /V:ON /C .\\scripts\\win_wrapper.cmd" + PIP: "%WRAPPER% %PYTHON%\\Scripts\\pip.exe" + SETUPPY: "%WRAPPER% %PYTHON%\\python setup.py --with-extension" + PYPI_USERNAME: "earwigbot" + PYPI_PASSWORD: + secure: gOIcvPxSC2ujuhwOzwj3v8xjq3CCYd8keFWVnguLM+gcL0e02qshDHy7gwZZwj0+ + + matrix: + - PYTHON: "C:\\Python27" + PYTHON_VERSION: "2.7" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python27-x64" + PYTHON_VERSION: "2.7" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python33" + PYTHON_VERSION: "3.3" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python33-x64" + PYTHON_VERSION: "3.3" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python34" + PYTHON_VERSION: "3.4" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python34-x64" + PYTHON_VERSION: "3.4" + PYTHON_ARCH: "64" + +install: + - "%PIP% install wheel twine" + +build_script: + - "%SETUPPY% build" + +test_script: + - "%SETUPPY% -q test" + +after_test: + - "%SETUPPY% bdist_wheel" + +on_success: + - "twine upload dist\\* -u %PYPI_USERNAME% -p %PYPI_PASSWORD%" + +artifacts: + - path: dist\* + +deploy: off diff --git a/docs/changelog.rst b/docs/changelog.rst index 9811b5c..b108b9a 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,11 +1,43 @@ Changelog ========= +v0.4.1 +------ + +`Released July 30, 2015 `_ +(`changes `__): + +- The process for building Windows binaries has been fixed, and these should be + distributed along with new releases. Windows users can now take advantage of + C speedups without having a compiler of their own. +- Added support for Python 3.5. +- ``<`` and ``>`` are now disallowed in wikilink titles and template names. + This includes when denoting tags, but not comments. +- Fixed the behavior of *preserve_spacing* in :meth:`.Template.add` and + *keep_field* in :meth:`.Template.remove` on parameters with hidden keys. +- Removed :meth:`._ListProxy.detach`. :class:`.SmartList`\ s now use weak + references and their children are garbage-collected properly. +- Fixed parser bugs involving: + + - templates with completely blank names; + - templates with newlines and comments. + +- Heavy refactoring and fixes to the C tokenizer, including: + + - corrected a design flaw in text handling, allowing for substantial speed + improvements when parsing long strings of plain text; + - implemented new Python 3.3 + `PEP 393 `_ Unicode APIs. + +- Fixed various bugs in :class:`.SmartList`, including one that was causing + memory issues on 64-bit builds of Python 2 on Windows. +- Fixed some bugs in the release scripts. + v0.4 ---- -Unreleased -(`changes `__): +`Released May 23, 2015 `_ +(`changes `__): - The parser now falls back on pure Python mode if C extensions cannot be built. This fixes an issue that prevented some Windows users from installing diff --git a/docs/index.rst b/docs/index.rst index 988f5e7..9a6c8ab 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -17,24 +17,22 @@ Development occurs on GitHub_. Installation ------------ -The easiest way to install the parser is through the `Python Package Index`_, -so you can install the latest release with ``pip install mwparserfromhell`` -(`get pip`_). 
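The README and integration-docs hunks in this change show only the opening lines of the ``parse()`` helper. A complete, minimal version looks like the following; everything past the visible ``"rvlimit": 1`` line (the ``rvprop``, ``format``, and ``titles`` keys and the JSON unpacking) is an assumption based on the standard MediaWiki query API, not something this diff specifies. The ``use_c`` check at the end is a quick way to confirm that one of the new binary wheels actually shipped the C tokenizer:

    import json
    from urllib.parse import urlencode
    from urllib.request import urlopen

    import mwparserfromhell

    API_URL = "https://en.wikipedia.org/w/api.php"

    def parse(title):
        # Fetch the latest revision's wikitext through the MediaWiki API.
        data = {"action": "query", "prop": "revisions", "rvlimit": 1,
                "rvprop": "content", "format": "json", "titles": title}
        raw = urlopen(API_URL, urlencode(data).encode()).read()
        res = json.loads(raw.decode("utf8"))
        page = list(res["query"]["pages"].values())[0]
        return mwparserfromhell.parse(page["revisions"][0]["*"])

    print(mwparserfromhell.parser.use_c)  # True when the C extension loaded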
Alternatively, get the latest development version:: +The easiest way to install the parser is through the `Python Package Index`_; +you can install the latest release with ``pip install mwparserfromhell`` +(`get pip`_). On Windows, make sure you have the latest version of pip +installed by running ``pip install --upgrade pip``. + +Alternatively, get the latest development version:: git clone https://github.com/earwig/mwparserfromhell.git cd mwparserfromhell python setup.py install -If you get ``error: Unable to find vcvarsall.bat`` while installing, this is -because Windows can't find the compiler for C extensions. Consult this -`StackOverflow question`_ for help. You can also set ``ext_modules`` in -``setup.py`` to an empty list to prevent the extension from building. - -You can run the comprehensive unit testing suite with ``python setup.py test``. +You can run the comprehensive unit testing suite with +``python setup.py test -q``. .. _Python Package Index: http://pypi.python.org .. _get pip: http://pypi.python.org/pypi/pip -.. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat Contents -------- diff --git a/docs/integration.rst b/docs/integration.rst index bbd00bb..af3abc9 100644 --- a/docs/integration.rst +++ b/docs/integration.rst @@ -25,7 +25,7 @@ If you're not using a library, you can parse any page using the following code from urllib.parse import urlencode from urllib.request import urlopen import mwparserfromhell - API_URL = "http://en.wikipedia.org/w/api.php" + API_URL = "https://en.wikipedia.org/w/api.php" def parse(title): data = {"action": "query", "prop": "revisions", "rvlimit": 1, diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 94b6e03..cb95c10 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -29,7 +29,7 @@ outrageously powerful parser for `MediaWiki `_ wikicode. __author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2012, 2013, 2014, 2015 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.4" +__version__ = "0.4.1" __email__ = "ben.kurtovic@gmail.com" from . 
import (compat, definitions, nodes, parser, smart_list, string_mixin, diff --git a/mwparserfromhell/compat.py b/mwparserfromhell/compat.py index 590a271..7a83cd1 100644 --- a/mwparserfromhell/compat.py +++ b/mwparserfromhell/compat.py @@ -18,14 +18,12 @@ if py3k: bytes = bytes str = str range = range - maxsize = sys.maxsize import html.entities as htmlentities else: bytes = str str = unicode range = xrange - maxsize = sys.maxint import htmlentitydefs as htmlentities del sys diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index e0ba16b..cdacb3d 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -81,10 +81,8 @@ def is_single_only(tag): """Return whether or not the given *tag* must exist without a close tag.""" return tag.lower() in SINGLE_ONLY -def is_scheme(scheme, slashes=True, reverse=False): +def is_scheme(scheme, slashes=True): """Return whether *scheme* is valid for external links.""" - if reverse: # Convenience for C - scheme = scheme[::-1] scheme = scheme.lower() if slashes: return scheme in URI_SCHEMES diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 7cbeb7d..4ee5f5d 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -82,21 +82,11 @@ class Template(Node): if char in node: code.replace(node, node.replace(char, replacement), False) - def _blank_param_value(self, value): - """Remove the content from *value* while keeping its whitespace. - - Replace *value*\ 's nodes with two text nodes, the first containing - whitespace from before its content and the second containing whitespace - from after its content. - """ - match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS) - value.nodes = [Text(match.group(1)), Text(match.group(2))] - def _select_theory(self, theories): """Return the most likely spacing convention given different options. - Given a dictionary of convention options as keys and their occurrence as - values, return the convention that occurs the most, or ``None`` if + Given a dictionary of convention options as keys and their occurrence + as values, return the convention that occurs the most, or ``None`` if there is no clear preferred style. """ if theories: @@ -129,34 +119,47 @@ class Template(Node): after = self._select_theory(after_theories) return before, after - def _remove_with_field(self, param, i, name): - """Return True if a parameter name should be kept, otherwise False.""" - if param.showkey: - following = self.params[i+1:] - better_matches = [after.name.strip() == name and not after.showkey for after in following] - if any(better_matches): - return False - return True - - def _remove_without_field(self, param, i): - """Return False if a parameter name should be kept, otherwise True.""" - if not param.showkey: - dependents = [not after.showkey for after in self.params[i+1:]] - if any(dependents): - return False - return True + def _blank_param_value(self, value): + """Remove the content from *value* while keeping its whitespace. + + Replace *value*\ 's nodes with two text nodes, the first containing + whitespace from before its content and the second containing whitespace + from after its content. 
+ """ + match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS) + value.nodes = [Text(match.group(1)), Text(match.group(2))] + + def _fix_dependendent_params(self, i): + """Unhide keys if necessary after removing the param at index *i*.""" + if not self.params[i].showkey: + for param in self.params[i + 1:]: + if not param.showkey: + param.showkey = True def _remove_exact(self, needle, keep_field): """Remove a specific parameter, *needle*, from the template.""" for i, param in enumerate(self.params): if param is needle: - if keep_field or not self._remove_without_field(param, i): + if keep_field: self._blank_param_value(param.value) else: + self._fix_dependendent_params(i) self.params.pop(i) return raise ValueError(needle) + def _should_remove(self, i, name): + """Look ahead for a parameter with the same name, but hidden. + + If one exists, we should remove the given one rather than blanking it. + """ + if self.params[i].showkey: + following = self.params[i + 1:] + better_matches = [after.name.strip() == name and not after.showkey + for after in following] + return any(better_matches) + return False + @property def name(self): """The name of the template, as a :class:`.Wikicode` object.""" @@ -213,26 +216,25 @@ class Template(Node): :func:`.utils.parse_anything`; pipes and equal signs are automatically escaped from *value* when appropriate. + If *name* is already a parameter in the template, we'll replace its + value. + If *showkey* is given, this will determine whether or not to show the parameter's name (e.g., ``{{foo|bar}}``'s parameter has a name of ``"1"`` but it is hidden); otherwise, we'll make a safe and intelligent guess. - If *name* is already a parameter in the template, we'll replace its - value while keeping the same whitespace around it. We will also try to - guess the dominant spacing convention when adding a new parameter using - :meth:`_get_spacing_conventions`. - If *before* is given (either a :class:`.Parameter` object or a name), then we will place the parameter immediately before this one. Otherwise, it will be added at the end. If *before* is a name and exists multiple times in the template, we will place it before the last occurrence. If *before* is not in the template, :exc:`ValueError` is - raised. The argument is ignored if the new parameter already exists. + raised. The argument is ignored if *name* is an existing parameter. - If *preserve_spacing* is ``False``, we will avoid preserving spacing - conventions when changing the value of an existing parameter or when - adding a new one. + If *preserve_spacing* is ``True``, we will try to preserve whitespace + conventions around the parameter, whether it is new or we are updating + an existing value. It is disabled for parameters with hidden keys, + since MediaWiki doesn't strip whitespace in this case. 
""" name, value = parse_anything(name), parse_anything(value) self._surface_escape(value, "|") @@ -245,7 +247,7 @@ class Template(Node): if not existing.showkey: self._surface_escape(value, "=") nodes = existing.value.nodes - if preserve_spacing: + if preserve_spacing and existing.showkey: for i in range(2): # Ignore empty text nodes if not nodes[i]: nodes[i] = None @@ -271,7 +273,7 @@ class Template(Node): if not showkey: self._surface_escape(value, "=") - if preserve_spacing: + if preserve_spacing and showkey: before_n, after_n = self._get_spacing_conventions(use_names=True) before_v, after_v = self._get_spacing_conventions(use_names=False) name = parse_anything([before_n, name, after_n]) @@ -294,36 +296,39 @@ class Template(Node): and :meth:`get`. If *keep_field* is ``True``, we will keep the parameter's name, but - blank its value. Otherwise, we will remove the parameter completely - *unless* other parameters are dependent on it (e.g. removing ``bar`` - from ``{{foo|bar|baz}}`` is unsafe because ``{{foo|baz}}`` is not what - we expected, so ``{{foo||baz}}`` will be produced instead). + blank its value. Otherwise, we will remove the parameter completely. + + When removing a parameter with a hidden name, subsequent parameters + with hidden names will be made visible. For example, removing ``bar`` + from ``{{foo|bar|baz}}`` produces ``{{foo|2=baz}}`` because + ``{{foo|baz}}`` is incorrect. If the parameter shows up multiple times in the template and *param* is not a :class:`.Parameter` object, we will remove all instances of it - (and keep only one if *keep_field* is ``True`` - the first instance if - none have dependents, otherwise the one with dependents will be kept). + (and keep only one if *keep_field* is ``True`` - either the one with a + hidden name, if it exists, or the first instance). 
""" if isinstance(param, Parameter): return self._remove_exact(param, keep_field) + name = str(param).strip() removed = False to_remove = [] + for i, param in enumerate(self.params): if param.name.strip() == name: if keep_field: - if self._remove_with_field(param, i, name): - self._blank_param_value(param.value) - keep_field = False - else: - to_remove.append(i) - else: - if self._remove_without_field(param, i): + if self._should_remove(i, name): to_remove.append(i) else: self._blank_param_value(param.value) + keep_field = False + else: + self._fix_dependendent_params(i) + to_remove.append(i) if not removed: removed = True + if not removed: raise ValueError(name) for i in reversed(to_remove): diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index ae13c76..cbe58c5 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -40,11 +40,11 @@ class ParserError(Exception): from .builder import Builder -from .tokenizer import Tokenizer try: from ._tokenizer import CTokenizer use_c = True except ImportError: + from .tokenizer import Tokenizer CTokenizer = None use_c = False @@ -70,6 +70,7 @@ class Parser(object): if use_c and CTokenizer: self._tokenizer = CTokenizer() else: + from .tokenizer import Tokenizer self._tokenizer = Tokenizer() self._builder = Builder() diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index e98d8f7..b676e86 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -89,6 +89,7 @@ Local (stack-specific) contexts: * :const:`FAIL_ON_LBRACE` * :const:`FAIL_ON_RBRACE` * :const:`FAIL_ON_EQUALS` + * :const:`HAS_TEMPLATE` * :const:`TABLE` @@ -161,15 +162,16 @@ FAIL_NEXT = 1 << 26 FAIL_ON_LBRACE = 1 << 27 FAIL_ON_RBRACE = 1 << 28 FAIL_ON_EQUALS = 1 << 29 +HAS_TEMPLATE = 1 << 30 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + - FAIL_ON_RBRACE + FAIL_ON_EQUALS) - -TABLE_OPEN = 1 << 30 -TABLE_CELL_OPEN = 1 << 31 -TABLE_CELL_STYLE = 1 << 32 -TABLE_ROW_OPEN = 1 << 33 -TABLE_TD_LINE = 1 << 34 -TABLE_TH_LINE = 1 << 35 + FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE) + +TABLE_OPEN = 1 << 31 +TABLE_CELL_OPEN = 1 << 32 +TABLE_CELL_STYLE = 1 << 33 +TABLE_ROW_OPEN = 1 << 34 +TABLE_TD_LINE = 1 << 35 +TABLE_TH_LINE = 1 << 36 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + TABLE_TD_LINE + TABLE_TH_LINE) diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h new file mode 100644 index 0000000..abade02 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -0,0 +1,125 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#pragma once + +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html +#endif + +#include <Python.h> +#include <math.h> +#include <structmember.h> + +/* Compatibility macros */ + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +#ifndef uint64_t +#define uint64_t unsigned PY_LONG_LONG +#endif + +#define malloc PyObject_Malloc // XXX: yuck +#define realloc PyObject_Realloc +#define free PyObject_Free + +/* Unicode support macros */ + +#if defined(IS_PY3K) && PY_MINOR_VERSION >= 3 +#define PEP_393 +#endif + +#ifdef PEP_393 +#define Unicode Py_UCS4 +#define PyUnicode_FROM_SINGLE(chr) \ + PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1) +#else +#define Unicode Py_UNICODE +#define PyUnicode_FROM_SINGLE(chr) \ + PyUnicode_FromUnicode(&(chr), 1) +#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE +#endif + +/* Error handling macros */ + +#define BAD_ROUTE self->route_state +#define BAD_ROUTE_CONTEXT self->route_context +#define FAIL_ROUTE(context) { \ + self->route_state = 1; \ + self->route_context = context; \ + } +#define RESET_ROUTE() self->route_state = 0 + +/* Shared globals */ + +extern char** entitydefs; + +extern PyObject* NOARGS; +extern PyObject* definitions; + +/* Structs */ + +typedef struct { + Py_ssize_t capacity; + Py_ssize_t length; +#ifdef PEP_393 + PyObject* object; + int kind; + void* data; +#else + Py_UNICODE* data; +#endif +} Textbuffer; + +struct Stack { + PyObject* stack; + uint64_t context; + Textbuffer* textbuffer; + struct Stack* next; +}; +typedef struct Stack Stack; + +typedef struct { + PyObject* object; /* base PyUnicodeObject object */ + Py_ssize_t length; /* length of object, in code points */ +#ifdef PEP_393 + int kind; /* object's kind value */ + void* data; /* object's raw unicode buffer */ +#else + Py_UNICODE* buf; /* object's internal buffer */ +#endif +} TokenizerInput; + +typedef struct { + PyObject_HEAD + TokenizerInput text; /* text to tokenize */ + Stack* topstack; /* topmost stack */ + Py_ssize_t head; /* current position in text */ + int global; /* global context */ + int depth; /* stack recursion depth */ + int cycles; /* total number of stack recursions */ + int route_state; /* whether a BadRoute has been triggered */ + uint64_t route_context; /* context when the last BadRoute was triggered */ + int skip_style_tags; /* temp fix for the sometimes broken tag parser */ +} Tokenizer; diff --git a/mwparserfromhell/parser/ctokenizer/contexts.h b/mwparserfromhell/parser/ctokenizer/contexts.h new file mode 100644 index 0000000..4e4a8c7 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/contexts.h @@ -0,0 +1,105 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the 
following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#pragma once + +/* Local contexts */ + +#define LC_TEMPLATE 0x0000000000000007 +#define LC_TEMPLATE_NAME 0x0000000000000001 +#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002 +#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004 + +#define LC_ARGUMENT 0x0000000000000018 +#define LC_ARGUMENT_NAME 0x0000000000000008 +#define LC_ARGUMENT_DEFAULT 0x0000000000000010 + +#define LC_WIKILINK 0x0000000000000060 +#define LC_WIKILINK_TITLE 0x0000000000000020 +#define LC_WIKILINK_TEXT 0x0000000000000040 + +#define LC_EXT_LINK 0x0000000000000180 +#define LC_EXT_LINK_URI 0x0000000000000080 +#define LC_EXT_LINK_TITLE 0x0000000000000100 + +#define LC_HEADING 0x0000000000007E00 +#define LC_HEADING_LEVEL_1 0x0000000000000200 +#define LC_HEADING_LEVEL_2 0x0000000000000400 +#define LC_HEADING_LEVEL_3 0x0000000000000800 +#define LC_HEADING_LEVEL_4 0x0000000000001000 +#define LC_HEADING_LEVEL_5 0x0000000000002000 +#define LC_HEADING_LEVEL_6 0x0000000000004000 + +#define LC_TAG 0x0000000000078000 +#define LC_TAG_OPEN 0x0000000000008000 +#define LC_TAG_ATTR 0x0000000000010000 +#define LC_TAG_BODY 0x0000000000020000 +#define LC_TAG_CLOSE 0x0000000000040000 + +#define LC_STYLE 0x0000000000780000 +#define LC_STYLE_ITALICS 0x0000000000080000 +#define LC_STYLE_BOLD 0x0000000000100000 +#define LC_STYLE_PASS_AGAIN 0x0000000000200000 +#define LC_STYLE_SECOND_PASS 0x0000000000400000 + +#define LC_DLTERM 0x0000000000800000 + +#define LC_SAFETY_CHECK 0x000000007F000000 +#define LC_HAS_TEXT 0x0000000001000000 +#define LC_FAIL_ON_TEXT 0x0000000002000000 +#define LC_FAIL_NEXT 0x0000000004000000 +#define LC_FAIL_ON_LBRACE 0x0000000008000000 +#define LC_FAIL_ON_RBRACE 0x0000000010000000 +#define LC_FAIL_ON_EQUALS 0x0000000020000000 +#define LC_HAS_TEMPLATE 0x0000000040000000 + +#define LC_TABLE 0x0000001F80000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000 +#define LC_TABLE_OPEN 0x0000000080000000 +#define LC_TABLE_CELL_OPEN 0x0000000100000000 +#define LC_TABLE_CELL_STYLE 0x0000000200000000 +#define LC_TABLE_ROW_OPEN 0x0000000400000000 +#define LC_TABLE_TD_LINE 0x0000000800000000 +#define LC_TABLE_TH_LINE 0x0000001000000000 + +/* Global contexts */ + +#define GL_HEADING 0x1 + +/* Aggregate contexts */ + +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN) +#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) +#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) + +/* Tag contexts */ + +#define TAG_NAME 0x01 +#define TAG_ATTR_READY 0x02 +#define TAG_ATTR_NAME 0x04 +#define TAG_ATTR_VALUE 0x08 +#define TAG_QUOTED 0x10 +#define TAG_NOTE_SPACE 
0x20 +#define TAG_NOTE_EQUALS 0x40 +#define TAG_NOTE_QUOTE 0x80 diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.c b/mwparserfromhell/parser/ctokenizer/tag_data.c new file mode 100644 index 0000000..2f67966 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tag_data.c @@ -0,0 +1,78 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "tag_data.h" +#include "contexts.h" + +/* + Initialize a new TagData object. +*/ +TagData* TagData_new(TokenizerInput* text) +{ +#define ALLOC_BUFFER(name) \ + name = Textbuffer_new(text); \ + if (!name) { \ + TagData_dealloc(self); \ + return NULL; \ + } + + TagData *self = malloc(sizeof(TagData)); + if (!self) { + PyErr_NoMemory(); + return NULL; + } + self->context = TAG_NAME; + ALLOC_BUFFER(self->pad_first) + ALLOC_BUFFER(self->pad_before_eq) + ALLOC_BUFFER(self->pad_after_eq) + self->quoter = 0; + self->reset = 0; + return self; + +#undef ALLOC_BUFFER +} + +/* + Deallocate the given TagData object. +*/ +void TagData_dealloc(TagData* self) +{ + if (self->pad_first) + Textbuffer_dealloc(self->pad_first); + if (self->pad_before_eq) + Textbuffer_dealloc(self->pad_before_eq); + if (self->pad_after_eq) + Textbuffer_dealloc(self->pad_after_eq); + free(self); +} + +/* + Clear the internal buffers of the given TagData object. +*/ +int TagData_reset_buffers(TagData* self) +{ + if (Textbuffer_reset(self->pad_first) || + Textbuffer_reset(self->pad_before_eq) || + Textbuffer_reset(self->pad_after_eq)) + return -1; + return 0; +} diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.h b/mwparserfromhell/parser/ctokenizer/tag_data.h new file mode 100644 index 0000000..f184081 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tag_data.h @@ -0,0 +1,43 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#pragma once + +#include "common.h" +#include "textbuffer.h" + +/* Structs */ + +typedef struct { + uint64_t context; + Textbuffer* pad_first; + Textbuffer* pad_before_eq; + Textbuffer* pad_after_eq; + Unicode quoter; + Py_ssize_t reset; +} TagData; + +/* Functions */ + +TagData* TagData_new(TokenizerInput*); +void TagData_dealloc(TagData*); +int TagData_reset_buffers(TagData*); diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.c b/mwparserfromhell/parser/ctokenizer/textbuffer.c new file mode 100644 index 0000000..0c711c5 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.c @@ -0,0 +1,232 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "textbuffer.h" + +#define INITIAL_CAPACITY 32 +#define RESIZE_FACTOR 2 +#define CONCAT_EXTRA 32 + +/* + Internal allocation function for textbuffers. +*/ +static int internal_alloc(Textbuffer* self, Unicode maxchar) +{ + self->capacity = INITIAL_CAPACITY; + self->length = 0; + +#ifdef PEP_393 + self->object = PyUnicode_New(self->capacity, maxchar); + if (!self->object) + return -1; + self->kind = PyUnicode_KIND(self->object); + self->data = PyUnicode_DATA(self->object); +#else + (void) maxchar; // Unused + self->data = malloc(sizeof(Unicode) * self->capacity); + if (!self->data) + return -1; +#endif + + return 0; +} + +/* + Internal deallocation function for textbuffers. +*/ +static void internal_dealloc(Textbuffer* self) +{ +#ifdef PEP_393 + Py_DECREF(self->object); +#else + free(self->data); +#endif +} + +/* + Internal resize function. 
+*/ +static int internal_resize(Textbuffer* self, Py_ssize_t new_cap) +{ +#ifdef PEP_393 + PyObject *newobj; + void *newdata; + + newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object)); + if (!newobj) + return -1; + newdata = PyUnicode_DATA(newobj); + memcpy(newdata, self->data, self->length * self->kind); + Py_DECREF(self->object); + self->object = newobj; + self->data = newdata; +#else + if (!(self->data = realloc(self->data, sizeof(Unicode) * new_cap))) + return -1; +#endif + + self->capacity = new_cap; + return 0; +} + +/* + Create a new textbuffer object. +*/ +Textbuffer* Textbuffer_new(TokenizerInput* text) +{ + Textbuffer* self = malloc(sizeof(Textbuffer)); + Unicode maxchar = 0; + +#ifdef PEP_393 + maxchar = PyUnicode_MAX_CHAR_VALUE(text->object); +#endif + + if (!self) + goto fail_nomem; + if (internal_alloc(self, maxchar) < 0) + goto fail_dealloc; + return self; + + fail_dealloc: + free(self); + fail_nomem: + PyErr_NoMemory(); + return NULL; +} + +/* + Deallocate the given textbuffer. +*/ +void Textbuffer_dealloc(Textbuffer* self) +{ + internal_dealloc(self); + free(self); +} + +/* + Reset a textbuffer to its initial, empty state. +*/ +int Textbuffer_reset(Textbuffer* self) +{ + Unicode maxchar = 0; + +#ifdef PEP_393 + maxchar = PyUnicode_MAX_CHAR_VALUE(self->object); +#endif + + internal_dealloc(self); + if (internal_alloc(self, maxchar)) + return -1; + return 0; +} + +/* + Write a Unicode codepoint to the given textbuffer. +*/ +int Textbuffer_write(Textbuffer* self, Unicode code) +{ + if (self->length >= self->capacity) { + if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0) + return -1; + } + +#ifdef PEP_393 + PyUnicode_WRITE(self->kind, self->data, self->length++, code); +#else + self->data[self->length++] = code; +#endif + + return 0; +} + +/* + Read a Unicode codepoint from the given index of the given textbuffer. + + This function does not check for bounds. +*/ +Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index) +{ +#ifdef PEP_393 + return PyUnicode_READ(self->kind, self->data, index); +#else + return self->data[index]; +#endif +} + +/* + Return the contents of the textbuffer as a Python Unicode object. +*/ +PyObject* Textbuffer_render(Textbuffer* self) +{ +#ifdef PEP_393 + return PyUnicode_FromKindAndData(self->kind, self->data, self->length); +#else + return PyUnicode_FromUnicode(self->data, self->length); +#endif +} + +/* + Concatenate the 'other' textbuffer onto the end of the given textbuffer. +*/ +int Textbuffer_concat(Textbuffer* self, Textbuffer* other) +{ + Py_ssize_t newlen = self->length + other->length; + + if (newlen > self->capacity) { + if (internal_resize(self, newlen + CONCAT_EXTRA) < 0) + return -1; + } + +#ifdef PEP_393 + assert(self->kind == other->kind); + memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data, + other->length * other->kind); +#else + memcpy(self->data + self->length, other->data, + other->length * sizeof(Unicode)); +#endif + + self->length = newlen; + return 0; +} + +/* + Reverse the contents of the given textbuffer. 
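``internal_resize()`` grows buffers geometrically on writes (``RESIZE_FACTOR``) and over-allocates on concatenation (``CONCAT_EXTRA``) so repeated appends stay amortized O(1). The same strategy in pure-Python form, purely as an illustration of the C logic above, not library API:

    class GrowableBuffer:
        """Illustration of the Textbuffer growth strategy."""
        INITIAL_CAPACITY, CONCAT_EXTRA = 32, 32

        def __init__(self):
            self.data = [None] * self.INITIAL_CAPACITY
            self.length = 0

        def write(self, char):
            # Mirror Textbuffer_write: double the capacity (RESIZE_FACTOR) when full.
            if self.length >= len(self.data):
                self.data.extend([None] * len(self.data))
            self.data[self.length] = char
            self.length += 1

        def concat(self, other):
            # Mirror Textbuffer_concat: resize once, with extra headroom.
            newlen = self.length + other.length
            if newlen > len(self.data):
                self.data.extend([None] * (newlen + self.CONCAT_EXTRA - len(self.data)))
            self.data[self.length:newlen] = other.data[:other.length]
            self.length = newlen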
+*/ +void Textbuffer_reverse(Textbuffer* self) +{ + Py_ssize_t i, end = self->length - 1; + Unicode tmp; + + for (i = 0; i < self->length / 2; i++) { +#ifdef PEP_393 + tmp = PyUnicode_READ(self->kind, self->data, i); + PyUnicode_WRITE(self->kind, self->data, i, + PyUnicode_READ(self->kind, self->data, end - i)); + PyUnicode_WRITE(self->kind, self->data, end - i, tmp); +#else + tmp = self->data[i]; + self->data[i] = self->data[end - i]; + self->data[end - i] = tmp; +#endif + } +} diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.h b/mwparserfromhell/parser/ctokenizer/textbuffer.h new file mode 100644 index 0000000..123d240 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h @@ -0,0 +1,36 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#pragma once + +#include "common.h" + +/* Functions */ + +Textbuffer* Textbuffer_new(TokenizerInput*); +void Textbuffer_dealloc(Textbuffer*); +int Textbuffer_reset(Textbuffer*); +int Textbuffer_write(Textbuffer*, Unicode); +Unicode Textbuffer_read(Textbuffer*, Py_ssize_t); +PyObject* Textbuffer_render(Textbuffer*); +int Textbuffer_concat(Textbuffer*, Textbuffer*); +void Textbuffer_reverse(Textbuffer*); diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c similarity index 72% rename from mwparserfromhell/parser/tokenizer.c rename to mwparserfromhell/parser/ctokenizer/tok_parse.c index c125021..23cc246 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -1,5 +1,4 @@ /* -Tokenizer for MWParserFromHell Copyright (C) 2012-2015 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of @@ -21,12 +20,42 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "tokenizer.h" +#include "tok_parse.h" +#include "contexts.h" +#include "tag_data.h" +#include "tok_support.h" +#include "tokens.h" + +#define DIGITS "0123456789" +#define HEXDIGITS "0123456789abcdefABCDEF" +#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + +#define MAX_BRACES 255 +#define MAX_ENTITY_SIZE 8 + +#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? 
"dt" : "li") +#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL)) +#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL)) +#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL)) +#define IS_SCHEME(scheme, slashes) \ + (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False)) + +typedef struct { + PyObject* title; + int level; +} HeadingData; + +/* Forward declarations */ + +static int Tokenizer_parse_entity(Tokenizer*); +static int Tokenizer_parse_comment(Tokenizer*); +static int Tokenizer_handle_dl_term(Tokenizer*); +static int Tokenizer_parse_tag(Tokenizer*); /* - Determine whether the given Py_UNICODE is a marker. + Determine whether the given code point is a marker. */ -static int is_marker(Py_UNICODE this) +static int is_marker(Unicode this) { int i; @@ -40,7 +69,7 @@ static int is_marker(Py_UNICODE this) /* Given a context, return the heading level encoded within it. */ -static int heading_level_from_context(int n) +static int heading_level_from_context(uint64_t n) { int level; @@ -51,14 +80,13 @@ static int heading_level_from_context(int n) } /* - Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as + Call the given function in definitions.py, using 'in1' and 'in2' as parameters, and return its output as a bool. */ -static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2, - PyObject* in3) +static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2) { PyObject* func = PyObject_GetAttrString(definitions, funcname); - PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL); + PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL); int ans = (result == Py_True) ? 1 : 0; Py_DECREF(func); @@ -89,505 +117,19 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr) return lowered; } -static Textbuffer* Textbuffer_new(void) -{ - Textbuffer* buffer = malloc(sizeof(Textbuffer)); - - if (!buffer) { - PyErr_NoMemory(); - return NULL; - } - buffer->size = 0; - buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE); - if (!buffer->data) { - free(buffer); - PyErr_NoMemory(); - return NULL; - } - buffer->prev = buffer->next = NULL; - return buffer; -} - -static void Textbuffer_dealloc(Textbuffer* self) -{ - Textbuffer* next; - - while (self) { - free(self->data); - next = self->next; - free(self); - self = next; - } -} - -/* - Write a Unicode codepoint to the given textbuffer. -*/ -static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) -{ - Textbuffer* self = *this; - - if (self->size == TEXTBUFFER_BLOCKSIZE) { - Textbuffer* new = Textbuffer_new(); - if (!new) - return -1; - new->next = self; - self->prev = new; - *this = self = new; - } - self->data[self->size++] = code; - return 0; -} - -/* - Return the contents of the textbuffer as a Python Unicode object. 
-*/ -static PyObject* Textbuffer_render(Textbuffer* self) -{ - PyObject *result = PyUnicode_FromUnicode(self->data, self->size); - PyObject *left, *concat; - - while (self->next) { - self = self->next; - left = PyUnicode_FromUnicode(self->data, self->size); - concat = PyUnicode_Concat(left, result); - Py_DECREF(left); - Py_DECREF(result); - result = concat; - } - return result; -} - -static TagData* TagData_new(void) -{ - TagData *self = malloc(sizeof(TagData)); - - #define ALLOC_BUFFER(name) \ - name = Textbuffer_new(); \ - if (!name) { \ - TagData_dealloc(self); \ - return NULL; \ - } - - if (!self) { - PyErr_NoMemory(); - return NULL; - } - self->context = TAG_NAME; - ALLOC_BUFFER(self->pad_first) - ALLOC_BUFFER(self->pad_before_eq) - ALLOC_BUFFER(self->pad_after_eq) - self->quoter = self->reset = 0; - return self; -} - -static void TagData_dealloc(TagData* self) -{ - #define DEALLOC_BUFFER(name) \ - if (name) \ - Textbuffer_dealloc(name); - - DEALLOC_BUFFER(self->pad_first); - DEALLOC_BUFFER(self->pad_before_eq); - DEALLOC_BUFFER(self->pad_after_eq); - free(self); -} - -static int TagData_reset_buffers(TagData* self) -{ - #define RESET_BUFFER(name) \ - Textbuffer_dealloc(name); \ - name = Textbuffer_new(); \ - if (!name) \ - return -1; - - RESET_BUFFER(self->pad_first) - RESET_BUFFER(self->pad_before_eq) - RESET_BUFFER(self->pad_after_eq) - return 0; -} - -static PyObject* -Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds) -{ - Tokenizer* self = (Tokenizer*) type->tp_alloc(type, 0); - return (PyObject*) self; -} - -static void Tokenizer_dealloc(Tokenizer* self) -{ - Stack *this = self->topstack, *next; - Py_XDECREF(self->text); - - while (this) { - Py_DECREF(this->stack); - Textbuffer_dealloc(this->textbuffer); - next = this->next; - free(this); - this = next; - } - Py_TYPE(self)->tp_free((PyObject*) self); -} - -static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) -{ - static char* kwlist[] = {NULL}; - - if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist)) - return -1; - self->text = Py_None; - Py_INCREF(Py_None); - self->topstack = NULL; - self->head = self->length = self->global = self->depth = self->cycles = 0; - return 0; -} - -/* - Add a new token stack, context, and textbuffer to the list. -*/ -static int Tokenizer_push(Tokenizer* self, uint64_t context) -{ - Stack* top = malloc(sizeof(Stack)); - - if (!top) { - PyErr_NoMemory(); - return -1; - } - top->stack = PyList_New(0); - top->context = context; - top->textbuffer = Textbuffer_new(); - if (!top->textbuffer) - return -1; - top->next = self->topstack; - self->topstack = top; - self->depth++; - self->cycles++; - return 0; -} - -/* - Push the textbuffer onto the stack as a Text node and clear it. 
-*/ -static int Tokenizer_push_textbuffer(Tokenizer* self) -{ - PyObject *text, *kwargs, *token; - Textbuffer* buffer = self->topstack->textbuffer; - - if (buffer->size == 0 && !buffer->next) - return 0; - text = Textbuffer_render(buffer); - if (!text) - return -1; - kwargs = PyDict_New(); - if (!kwargs) { - Py_DECREF(text); - return -1; - } - PyDict_SetItemString(kwargs, "text", text); - Py_DECREF(text); - token = PyObject_Call(Text, NOARGS, kwargs); - Py_DECREF(kwargs); - if (!token) - return -1; - if (PyList_Append(self->topstack->stack, token)) { - Py_DECREF(token); - return -1; - } - Py_DECREF(token); - Textbuffer_dealloc(buffer); - self->topstack->textbuffer = Textbuffer_new(); - if (!self->topstack->textbuffer) - return -1; - return 0; -} - -/* - Pop and deallocate the top token stack/context/textbuffer. -*/ -static void Tokenizer_delete_top_of_stack(Tokenizer* self) -{ - Stack* top = self->topstack; - - Py_DECREF(top->stack); - Textbuffer_dealloc(top->textbuffer); - self->topstack = top->next; - free(top); - self->depth--; -} - -/* - Pop the current stack/context/textbuffer, returing the stack. -*/ -static PyObject* Tokenizer_pop(Tokenizer* self) -{ - PyObject* stack; - - if (Tokenizer_push_textbuffer(self)) - return NULL; - stack = self->topstack->stack; - Py_INCREF(stack); - Tokenizer_delete_top_of_stack(self); - return stack; -} - -/* - Pop the current stack/context/textbuffer, returing the stack. We will also - replace the underlying stack's context with the current stack's. -*/ -static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) -{ - PyObject* stack; - uint64_t context; - - if (Tokenizer_push_textbuffer(self)) - return NULL; - stack = self->topstack->stack; - Py_INCREF(stack); - context = self->topstack->context; - Tokenizer_delete_top_of_stack(self); - self->topstack->context = context; - return stack; -} - -/* - Fail the current tokenization route. Discards the current - stack/context/textbuffer and sets the BAD_ROUTE flag. -*/ -static void* Tokenizer_fail_route(Tokenizer* self) -{ - uint64_t context = self->topstack->context; - PyObject* stack = Tokenizer_pop(self); - - Py_XDECREF(stack); - FAIL_ROUTE(context); - return NULL; -} - -/* - Write a token to the current token stack. -*/ -static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) -{ - PyObject* instance; - - if (Tokenizer_push_textbuffer(self)) - return -1; - instance = PyObject_CallObject(token, NULL); - if (!instance) - return -1; - if (first ? PyList_Insert(self->topstack->stack, 0, instance) : - PyList_Append(self->topstack->stack, instance)) { - Py_DECREF(instance); - return -1; - } - Py_DECREF(instance); - return 0; -} - -/* - Write a token to the current token stack, with kwargs. Steals a reference - to kwargs. -*/ -static int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token, - PyObject* kwargs, int first) -{ - PyObject* instance; - - if (Tokenizer_push_textbuffer(self)) { - Py_DECREF(kwargs); - return -1; - } - instance = PyObject_Call(token, NOARGS, kwargs); - if (!instance) { - Py_DECREF(kwargs); - return -1; - } - if (first ? PyList_Insert(self->topstack->stack, 0, instance): - PyList_Append(self->topstack->stack, instance)) { - Py_DECREF(instance); - Py_DECREF(kwargs); - return -1; - } - Py_DECREF(instance); - Py_DECREF(kwargs); - return 0; -} - -/* - Write a Unicode codepoint to the current textbuffer. 
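The push/pop/textbuffer routines being removed here (presumably relocated to the new ``tok_support.c``, given the ``#include "tok_support.h"`` added above) implement the tokenizer's core design: a stack of token stacks, where each speculative parse pushes a fresh stack and either pops it into its parent on success or discards it through ``fail_route``. A minimal pure-Python sketch of that control flow, for illustration only:

    class MiniTokenizer:
        """Sketch of the stack-of-stacks model; not the library's API."""

        def __init__(self):
            self.stacks = []       # each entry: (token list, context bits)
            self.bad_route = False

        def push(self, context):
            self.stacks.append(([], context))

        def pop(self):
            tokens, _ = self.stacks.pop()
            return tokens

        def fail_route(self):
            # Discard the speculative stack and flag the dead end.
            self.pop()
            self.bad_route = True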
-*/ -static int Tokenizer_emit_char(Tokenizer* self, Py_UNICODE code) -{ - return Textbuffer_write(&(self->topstack->textbuffer), code); -} - -/* - Write a string of text to the current textbuffer. -*/ -static int Tokenizer_emit_text(Tokenizer* self, const char* text) -{ - int i = 0; - - while (text[i]) { - if (Tokenizer_emit_char(self, text[i])) - return -1; - i++; - } - return 0; -} - -/* - Write the contents of another textbuffer to the current textbuffer, - deallocating it in the process. -*/ -static int -Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse) -{ - Textbuffer *original = buffer; - long i; - - if (reverse) { - do { - for (i = buffer->size - 1; i >= 0; i--) { - if (Tokenizer_emit_char(self, buffer->data[i])) { - Textbuffer_dealloc(original); - return -1; - } - } - } while ((buffer = buffer->next)); - } - else { - while (buffer->next) - buffer = buffer->next; - do { - for (i = 0; i < buffer->size; i++) { - if (Tokenizer_emit_char(self, buffer->data[i])) { - Textbuffer_dealloc(original); - return -1; - } - } - } while ((buffer = buffer->prev)); - } - Textbuffer_dealloc(original); - return 0; -} - -/* - Write a series of tokens to the current stack at once. -*/ -static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist) -{ - int pushed = 0; - PyObject *stack, *token, *left, *right, *text; - Textbuffer* buffer; - Py_ssize_t size; - - if (PyList_GET_SIZE(tokenlist) > 0) { - token = PyList_GET_ITEM(tokenlist, 0); - switch (PyObject_IsInstance(token, Text)) { - case 0: - break; - case 1: { - pushed = 1; - buffer = self->topstack->textbuffer; - if (buffer->size == 0 && !buffer->next) - break; - left = Textbuffer_render(buffer); - if (!left) - return -1; - right = PyObject_GetAttrString(token, "text"); - if (!right) - return -1; - text = PyUnicode_Concat(left, right); - Py_DECREF(left); - Py_DECREF(right); - if (!text) - return -1; - if (PyObject_SetAttrString(token, "text", text)) { - Py_DECREF(text); - return -1; - } - Py_DECREF(text); - Textbuffer_dealloc(buffer); - self->topstack->textbuffer = Textbuffer_new(); - if (!self->topstack->textbuffer) - return -1; - break; - } - case -1: - return -1; - } - } - if (!pushed) { - if (Tokenizer_push_textbuffer(self)) - return -1; - } - stack = self->topstack->stack; - size = PyList_GET_SIZE(stack); - if (PyList_SetSlice(stack, size, size, tokenlist)) - return -1; - return 0; -} - -/* - Pop the current stack, write text, and then write the stack. 'text' is a - NULL-terminated array of chars. -*/ -static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text) -{ - PyObject* stack = Tokenizer_pop(self); - - if (Tokenizer_emit_text(self, text)) { - Py_DECREF(stack); - return -1; - } - if (stack) { - if (PyList_GET_SIZE(stack) > 0) { - if (Tokenizer_emit_all(self, stack)) { - Py_DECREF(stack); - return -1; - } - } - Py_DECREF(stack); - } - self->head--; - return 0; -} - -/* - Read the value at a relative point in the wikicode, forwards. -*/ -static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta) -{ - Py_ssize_t index = self->head + delta; - - if (index >= self->length) - return EMPTY; - return PyList_GET_ITEM(self->text, index); -} - -/* - Read the value at a relative point in the wikicode, backwards. -*/ -static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) -{ - Py_ssize_t index; - - if (delta > self->head) - return EMPTY; - index = self->head - delta; - return PyList_GET_ITEM(self->text, index); -} - /* Parse a template at the head of the wikicode string. 
*/ -static int Tokenizer_parse_template(Tokenizer* self) +static int Tokenizer_parse_template(Tokenizer* self, int has_content) { PyObject *template; Py_ssize_t reset = self->head; + uint64_t context = LC_TEMPLATE_NAME; + + if (has_content) + context |= LC_HAS_TEMPLATE; - template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1); + template = Tokenizer_parse(self, context, 1); if (BAD_ROUTE) { self->head = reset; return 0; @@ -643,10 +185,11 @@ static int Tokenizer_parse_argument(Tokenizer* self) static int Tokenizer_parse_template_or_argument(Tokenizer* self) { unsigned int braces = 2, i; + int has_content = 0; PyObject *tokenlist; self->head += 2; - while (Tokenizer_READ(self, 0) == '{' && braces < MAX_BRACES) { + while (Tokenizer_read(self, 0) == '{' && braces < MAX_BRACES) { self->head++; braces++; } @@ -659,7 +202,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) return 0; } if (braces == 2) { - if (Tokenizer_parse_template(self)) + if (Tokenizer_parse_template(self, has_content)) return -1; if (BAD_ROUTE) { RESET_ROUTE(); @@ -673,7 +216,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) return -1; if (BAD_ROUTE) { RESET_ROUTE(); - if (Tokenizer_parse_template(self)) + if (Tokenizer_parse_template(self, has_content)) return -1; if (BAD_ROUTE) { char text[MAX_BRACES + 1]; @@ -689,8 +232,10 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) } else braces -= 3; - if (braces) + if (braces) { + has_content = 1; self->head++; + } } tokenlist = Tokenizer_pop(self); if (!tokenlist) @@ -712,8 +257,13 @@ static int Tokenizer_handle_template_param(Tokenizer* self) { PyObject *stack; - if (self->topstack->context & LC_TEMPLATE_NAME) + if (self->topstack->context & LC_TEMPLATE_NAME) { + if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) { + Tokenizer_fail_route(self); + return -1; + } self->topstack->context ^= LC_TEMPLATE_NAME; + } else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE; if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { @@ -764,7 +314,11 @@ static PyObject* Tokenizer_handle_template_end(Tokenizer* self) { PyObject* stack; - if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { + if (self->topstack->context & LC_TEMPLATE_NAME) { + if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) + return Tokenizer_fail_route(self); + } + else if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { stack = Tokenizer_pop_keeping_context(self); if (!stack) return NULL; @@ -866,21 +420,21 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; Textbuffer* buffer; PyObject* scheme; - Py_UNICODE this; + Unicode this; int slashes, i; if (Tokenizer_push(self, LC_EXT_LINK_URI)) return -1; - if (Tokenizer_READ(self, 0) == '/' && Tokenizer_READ(self, 1) == '/') { + if (Tokenizer_read(self, 0) == '/' && Tokenizer_read(self, 1) == '/') { if (Tokenizer_emit_text(self, "//")) return -1; self->head += 2; } else { - buffer = Textbuffer_new(); + buffer = Textbuffer_new(&self->text); if (!buffer) return -1; - while ((this = Tokenizer_READ(self, 0))) { + while ((this = Tokenizer_read(self, 0))) { i = 0; while (1) { if (!valid[i]) @@ -889,7 +443,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) break; i++; } - Textbuffer_write(&buffer, this); + Textbuffer_write(buffer, this); if (Tokenizer_emit_char(self, this)) { Textbuffer_dealloc(buffer); return -1; @@ -907,8 +461,8 @@ 
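With these checks, a template whose name contains neither text nor a nested template fails the parse route, which is the changelog's "completely blank names" fix. The visible effect (a sketch of the expected result):

    import mwparserfromhell

    code = mwparserfromhell.parse("{{}} and {{foo}}")
    print(code.filter_templates())  # ['{{foo}}']: the bare "{{}}" stays plain text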
static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) return -1; } self->head++; - slashes = (Tokenizer_READ(self, 0) == '/' && - Tokenizer_READ(self, 1) == '/'); + slashes = (Tokenizer_read(self, 0) == '/' && + Tokenizer_read(self, 1) == '/'); if (slashes) { if (Tokenizer_emit_text(self, "//")) { Textbuffer_dealloc(buffer); @@ -920,7 +474,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) Textbuffer_dealloc(buffer); if (!scheme) return -1; - if (!IS_SCHEME(scheme, slashes, 0)) { + if (!IS_SCHEME(scheme, slashes)) { Py_DECREF(scheme); Tokenizer_fail_route(self); return 0; @@ -936,46 +490,40 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) { static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; - Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; + Textbuffer *scheme_buffer = Textbuffer_new(&self->text); PyObject *scheme; - Py_UNICODE chunk; - long i; + Unicode chunk; + Py_ssize_t i; int slashes, j; if (!scheme_buffer) return -1; // We have to backtrack through the textbuffer looking for our scheme since // it was just parsed as text: - temp_buffer = self->topstack->textbuffer; - while (temp_buffer) { - for (i = temp_buffer->size - 1; i >= 0; i--) { - chunk = temp_buffer->data[i]; - if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) - goto end_of_loop; - j = 0; - while (1) { - if (!valid[j]) { - Textbuffer_dealloc(scheme_buffer); - FAIL_ROUTE(0); - return 0; - } - if (chunk == valid[j]) - break; - j++; + for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) { + chunk = Textbuffer_read(self->topstack->textbuffer, i); + if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) + goto end_of_loop; + j = 0; + do { + if (!valid[j]) { + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; } - Textbuffer_write(&scheme_buffer, chunk); - } - temp_buffer = temp_buffer->next; + } while (chunk != valid[j++]); + Textbuffer_write(scheme_buffer, chunk); } end_of_loop: + Textbuffer_reverse(scheme_buffer); scheme = Textbuffer_render(scheme_buffer); if (!scheme) { Textbuffer_dealloc(scheme_buffer); return -1; } - slashes = (Tokenizer_READ(self, 0) == '/' && - Tokenizer_READ(self, 1) == '/'); - if (!IS_SCHEME(scheme, slashes, 1)) { + slashes = (Tokenizer_read(self, 0) == '/' && + Tokenizer_read(self, 1) == '/'); + if (!IS_SCHEME(scheme, slashes)) { Py_DECREF(scheme); Textbuffer_dealloc(scheme_buffer); FAIL_ROUTE(0); @@ -986,7 +534,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) Textbuffer_dealloc(scheme_buffer); return -1; } - if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1)) + if (Tokenizer_emit_textbuffer(self, scheme_buffer)) return -1; if (Tokenizer_emit_char(self, ':')) return -1; @@ -1001,29 +549,27 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) /* Handle text in a free external link, including trailing punctuation. 
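``Tokenizer_handle_free_link_text()``, whose body follows, diverts trailing punctuation (commas, periods, and a closing parenthesis with no opening one, among others) into a tail buffer so it can be excluded from the link when nothing else follows. At the API level (a sketch of the expected result):

    import mwparserfromhell

    code = mwparserfromhell.parse("See https://example.com/foo, then more text")
    link = code.filter_external_links()[0]
    print(link.url)  # https://example.com/foo -- the comma is left outside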
@@ -1001,29 +549,27 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
 /*
     Handle text in a free external link, including trailing punctuation.
 */
-static int
-Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
-                                Textbuffer** tail, Py_UNICODE this)
+static int Tokenizer_handle_free_link_text(
+    Tokenizer* self, int* parens, Textbuffer* tail, Unicode this)
 {
-    #define PUSH_TAIL_BUFFER(tail, error) \
-        if ((tail)->size || (tail)->next) { \
-            if (Tokenizer_emit_textbuffer(self, tail, 0)) \
-                return error; \
-            tail = Textbuffer_new(); \
-            if (!(tail)) \
-                return error; \
+    #define PUSH_TAIL_BUFFER(tail, error) \
+        if (tail->length > 0) { \
+            if (Textbuffer_concat(self->topstack->textbuffer, tail)) \
+                return error; \
+            if (Textbuffer_reset(tail)) \
+                return error; \
         }
 
     if (this == '(' && !(*parens)) {
         *parens = 1;
-        PUSH_TAIL_BUFFER(*tail, -1)
+        PUSH_TAIL_BUFFER(tail, -1)
     }
     else if (this == ',' || this == ';' || this == '\\' || this == '.' ||
              this == ':' || this == '!' || this == '?' ||
             (!(*parens) && this == ')'))
         return Textbuffer_write(tail, this);
     else
-        PUSH_TAIL_BUFFER(*tail, -1)
+        PUSH_TAIL_BUFFER(tail, -1)
     return Tokenizer_emit_char(self, this);
 }
 
@@ -1031,10 +577,10 @@ Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
     Return whether the current head is the end of a free link.
 */
 static int
-Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
+Tokenizer_is_free_link(Tokenizer* self, Unicode this, Unicode next)
 {
     // Built from Tokenizer_parse()'s end sentinels:
-    Py_UNICODE after = Tokenizer_READ(self, 2);
+    Unicode after = Tokenizer_read(self, 2);
     uint64_t ctx = self->topstack->context;
 
     return (!this || this == '\n' || this == '[' || this == ']' ||
@@ -1050,9 +596,9 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
 */
 static PyObject*
 Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
-                                     Textbuffer** extra)
+                                     Textbuffer* extra)
 {
-    Py_UNICODE this, next;
+    Unicode this, next;
     int parens = 0;
 
     if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) :
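PUSH_TAIL_BUFFER holds punctuation that may turn out to trail a free link; the rewrite above concatenates the held tail into the current text buffer and resets it in place, instead of emitting it and allocating a fresh buffer each time. A self-contained sketch of the buffering idea, using plain C strings in place of Textbuffer:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *input = "http://example.com/foo,";
        char link[128] = "", tail[16] = "";
        const char *p;
        size_t tlen;

        for (p = input; *p; p++) {
            if (strchr(",;.:!?", *p)) {
                /* Possibly-trailing punctuation: hold it in the tail buffer. */
                tlen = strlen(tail);
                tail[tlen] = *p;
                tail[tlen + 1] = '\0';
            }
            else {
                /* More link text follows, so the held tail belongs to the
                   link: concatenate and reset (the new strategy), rather
                   than emit and reallocate (the old one). */
                strcat(link, tail);
                tail[0] = '\0';
                strncat(link, p, 1);
            }
        }
        printf("link: %s\n", link);  /* http://example.com/foo */
        printf("tail: %s\n", tail);  /* the "," stays outside the link */
        return 0;
    }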
@@ -1060,23 +606,23 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
         return NULL;
     if (BAD_ROUTE)
         return NULL;
-    this = Tokenizer_READ(self, 0);
+    this = Tokenizer_read(self, 0);
     if (!this || this == '\n' || this == ' ' || this == ']')
         return Tokenizer_fail_route(self);
     if (!brackets && this == '[')
         return Tokenizer_fail_route(self);
     while (1) {
-        this = Tokenizer_READ(self, 0);
-        next = Tokenizer_READ(self, 1);
+        this = Tokenizer_read(self, 0);
+        next = Tokenizer_read(self, 1);
         if (this == '&') {
-            PUSH_TAIL_BUFFER(*extra, NULL)
+            PUSH_TAIL_BUFFER(extra, NULL)
             if (Tokenizer_parse_entity(self))
                 return NULL;
         }
         else if (this == '<' && next == '!'
-                && Tokenizer_READ(self, 2) == '-'
-                && Tokenizer_READ(self, 3) == '-') {
-            PUSH_TAIL_BUFFER(*extra, NULL)
+                && Tokenizer_read(self, 2) == '-'
+                && Tokenizer_read(self, 3) == '-') {
+            PUSH_TAIL_BUFFER(extra, NULL)
             if (Tokenizer_parse_comment(self))
                 return NULL;
         }
@@ -1087,7 +633,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
         else if (!this || this == '\n')
             return Tokenizer_fail_route(self);
         else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) {
-            PUSH_TAIL_BUFFER(*extra, NULL)
+            PUSH_TAIL_BUFFER(extra, NULL)
             if (Tokenizer_parse_template_or_argument(self))
                 return NULL;
         }
@@ -1127,7 +673,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
     PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"),
              *split, *scheme;
     Py_ssize_t length;
-    Textbuffer* temp;
 
     if (!text)
         return -1;
@@ -1136,19 +681,9 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
     if (!split)
         return -1;
     scheme = PyList_GET_ITEM(split, 0);
-    length = PyUnicode_GET_SIZE(scheme);
-    while (length) {
-        temp = self->topstack->textbuffer;
-        if (length <= temp->size) {
-            temp->size -= length;
-            break;
-        }
-        length -= temp->size;
-        self->topstack->textbuffer = temp->next;
-        free(temp->data);
-        free(temp);
-    }
+    length = PyUnicode_GET_LENGTH(scheme);
     Py_DECREF(split);
+    self->topstack->textbuffer->length -= length;
     return 0;
 }
 
@@ -1161,20 +696,20 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
     #define NOT_A_LINK \
         if (!brackets && self->topstack->context & LC_DLTERM) \
             return Tokenizer_handle_dl_term(self); \
-        return Tokenizer_emit_char(self, Tokenizer_READ(self, 0))
+        return Tokenizer_emit_char(self, Tokenizer_read(self, 0))
 
     Py_ssize_t reset = self->head;
     PyObject *link, *kwargs;
-    Textbuffer *extra = 0;
+    Textbuffer *extra;
 
     if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
         NOT_A_LINK;
     }
-    extra = Textbuffer_new();
+    extra = Textbuffer_new(&self->text);
     if (!extra)
         return -1;
     self->head++;
-    link = Tokenizer_really_parse_external_link(self, brackets, &extra);
+    link = Tokenizer_really_parse_external_link(self, brackets, extra);
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
@@ -1214,8 +749,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
         Textbuffer_dealloc(extra);
         return -1;
     }
-    if (extra->size || extra->next)
-        return Tokenizer_emit_textbuffer(self, extra, 0);
+    if (extra->length > 0)
+        return Tokenizer_emit_textbuffer(self, extra);
     Textbuffer_dealloc(extra);
     return 0;
 }
 
@@ -1232,7 +767,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
 
     self->global |= GL_HEADING;
     self->head += 1;
-    while (Tokenizer_READ(self, 0) == '=') {
+    while (Tokenizer_read(self, 0) == '=') {
         best++;
         self->head++;
     }
@@ -1248,7 +783,11 @@ static int Tokenizer_parse_heading(Tokenizer* self)
         self->global ^= GL_HEADING;
         return 0;
     }
-    level = NEW_INT_FUNC(heading->level);
+#ifdef IS_PY3K
+    level = PyLong_FromSsize_t(heading->level);
+#else
+    level = PyInt_FromSsize_t(heading->level);
+#endif
     if (!level) {
         Py_DECREF(heading->title);
         free(heading);
@@ -1303,7 +842,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
 
     self->head += 1;
     best = 1;
-    while (Tokenizer_READ(self, 0) == '=') {
+    while (Tokenizer_read(self, 0) == '=') {
         best++;
         self->head++;
     }
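The heading hunk replaces the old NEW_INT_FUNC macro with an explicit IS_PY3K conditional: Python 3 has only PyLong, while Python 2 also offers the machine-sized PyInt. A minimal sketch of the same portability idiom using the stock PY_MAJOR_VERSION check (IS_PY3K is the module's own macro); both C-API calls exist since Python 2.5:

    #include <Python.h>

    /* Build a Python integer for a heading level, portably across 2.x/3.x. */
    static PyObject* make_level(Py_ssize_t level)
    {
    #if PY_MAJOR_VERSION >= 3
        return PyLong_FromSsize_t(level);  /* Python 3: a single int type */
    #else
        return PyInt_FromSsize_t(level);   /* Python 2: machine-sized int */
    #endif
    }

    int main(void)
    {
        PyObject *level;

        Py_Initialize();
        level = make_level(2);
        PyObject_Print(level, stdout, 0);  /* prints: 2 */
        Py_DECREF(level);
        Py_Finalize();
        return 0;
    }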
@@ -1357,8 +896,8 @@
 */
 static int Tokenizer_really_parse_entity(Tokenizer* self)
 {
-    PyObject *kwargs, *textobj;
-    Py_UNICODE this;
+    PyObject *kwargs, *charobj, *textobj;
+    Unicode this;
     int numeric, hexadecimal, i, j, zeroes, test;
     char *valid, *text, *buffer, *def;
@@ -1371,7 +910,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
     if (Tokenizer_emit(self, HTMLEntityStart))
         return -1;
     self->head++;
-    this = Tokenizer_READ(self, 0);
+    this = Tokenizer_read(self, 0);
     if (!this) {
         Tokenizer_fail_route(self);
         return 0;
@@ -1381,7 +920,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
         if (Tokenizer_emit(self, HTMLEntityNumeric))
             return -1;
         self->head++;
-        this = Tokenizer_READ(self, 0);
+        this = Tokenizer_read(self, 0);
         if (!this) {
             Tokenizer_fail_route(self);
             return 0;
@@ -1391,7 +930,12 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
             kwargs = PyDict_New();
             if (!kwargs)
                 return -1;
-            PyDict_SetItemString(kwargs, "char", Tokenizer_read(self, 0));
+            if (!(charobj = PyUnicode_FROM_SINGLE(this))) {
+                Py_DECREF(kwargs);
+                return -1;
+            }
+            PyDict_SetItemString(kwargs, "char", charobj);
+            Py_DECREF(charobj);
             if (Tokenizer_emit_kwargs(self, HTMLEntityHex, kwargs))
                 return -1;
             self->head++;
@@ -1415,7 +959,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
     i = 0;
     zeroes = 0;
     while (1) {
-        this = Tokenizer_READ(self, 0);
+        this = Tokenizer_read(self, 0);
         if (this == ';') {
             if (i == 0)
                 FAIL_ROUTE_AND_EXIT()
@@ -1528,21 +1072,21 @@ static int Tokenizer_parse_comment(Tokenizer* self)
 {
     Py_ssize_t reset = self->head + 3;
     PyObject *comment;
-    Py_UNICODE this;
+    Unicode this;
 
     self->head += 4;
     if (Tokenizer_push(self, 0))
         return -1;
     while (1) {
-        this = Tokenizer_READ(self, 0);
+        this = Tokenizer_read(self, 0);
         if (!this) {
             comment = Tokenizer_pop(self);
             Py_XDECREF(comment);
             self->head = reset;
            return Tokenizer_emit_text(self, "<!--");

+input: "{{<!-- comment -->\nfoobar\n<!-- comment -->}}"
+output: [TemplateOpen(), CommentStart(), Text(text=" comment "), CommentEnd(), Text(text="\nfoobar\n"), CommentStart(), Text(text=" comment "), CommentEnd(), TemplateClose()]
+
+---
+
+name: tag_in_link_title
+label: HTML tags are invalid in link titles, even when complete
+input: "[[foo<i>bar</i>baz]]"
+output: [Text(text="[[foo"), TagOpenOpen(), Text(text="i"), TagCloseOpen(padding=""), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="baz]]")]
+
+---
+
+name: tag_in_template_name
+label: HTML tags are invalid in template names, even when complete
+input: "{{foo<i>bar</i>baz}}"
+output: [Text(text="{{foo"), TagOpenOpen(), Text(text="i"), TagCloseOpen(padding=""), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="baz}}")]
+
+---
+
+name: tag_in_link_text
+label: HTML tags are valid in link text
+input: "[[foo|<i>bar</i>baz]]"
+output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), TagOpenOpen(), Text(text="i"), TagCloseOpen(padding=""), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="baz"), WikilinkClose()]
+
+---
+
+name: comment_in_link_title
+label: comments are valid in link titles
+input: "[[foo<!--bar-->baz]]"
+output: [WikilinkOpen(), Text(text="foo"), CommentStart(), Text(text="bar"), CommentEnd(), Text(text="baz"), WikilinkClose()]
+
+---
+
+name: incomplete_comment_in_link_title
+label: incomplete comments are invalid in link titles
+input: "[[foo<!--
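The incomplete-comment test cases above exercise the tokenizer's fallback path: when "<!--" is never closed by "-->", the parse route fails, the head resets, and the opener is emitted as plain text (the Tokenizer_emit_text(self, "<!--") branch in Tokenizer_parse_comment). A standalone sketch of that reset-and-emit-literally pattern:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *page = "[[foo<!--bar";
        const char *open = strstr(page, "<!--");

        if (open) {
            const char *close = strstr(open + 4, "-->");
            if (close)
                /* Well-formed: the body becomes a comment token. */
                printf("comment: %.*s\n", (int)(close - open - 4), open + 4);
            else
                /* No terminator: backtrack and treat the opener as text. */
                printf("plain text: %s\n", open);
        }
        return 0;
    }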