@@ -5,6 +5,8 @@ python:
   - 3.2
   - 3.3
   - 3.4
+  - 3.5-dev
+sudo: false
 install:
   - pip install coveralls
   - python setup.py build
@@ -1,4 +1,27 @@
-v0.4 (unreleased):
+v0.4.1 (released July 30, 2015):
+
+- The process for building Windows binaries has been fixed, and these should be
+  distributed along with new releases. Windows users can now take advantage of
+  C speedups without having a compiler of their own.
+- Added support for Python 3.5.
+- '<' and '>' are now disallowed in wikilink titles and template names. This
+  includes when denoting tags, but not comments.
+- Fixed the behavior of preserve_spacing in Template.add() and keep_field in
+  Template.remove() on parameters with hidden keys.
+- Removed _ListProxy.detach(). SmartLists now use weak references and their
+  children are garbage-collected properly.
+- Fixed parser bugs involving:
+  - templates with completely blank names;
+  - templates with newlines and comments.
+- Heavy refactoring and fixes to the C tokenizer, including:
+  - corrected a design flaw in text handling, allowing for substantial speed
+    improvements when parsing long strings of plain text;
+  - implemented new Python 3.3 PEP 393 Unicode APIs.
+- Fixed various bugs in SmartList, including one that was causing memory issues
+  on 64-bit builds of Python 2 on Windows.
+- Fixed some bugs in the release scripts.
+
+v0.4 (released May 23, 2015):
 
 - The parser now falls back on pure Python mode if C extensions cannot be
   built. This fixes an issue that prevented some Windows users from installing
@@ -139,7 +139,7 @@ If you're not using a library, you can parse any page using the following code
     from urllib.parse import urlencode
     from urllib.request import urlopen
     import mwparserfromhell
-    API_URL = "http://en.wikipedia.org/w/api.php"
+    API_URL = "https://en.wikipedia.org/w/api.php"
 
     def parse(title):
         data = {"action": "query", "prop": "revisions", "rvlimit": 1,
@@ -156,7 +156,6 @@ If you're not using a library, you can parse any page using the following code
 .. _Legoktm: http://en.wikipedia.org/wiki/User:Legoktm
 .. _GitHub: https://github.com/earwig/mwparserfromhell
 .. _Python Package Index: http://pypi.python.org
-.. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat
 .. _get pip: http://pypi.python.org/pypi/pip
 .. _EarwigBot: https://github.com/earwig/earwigbot
 .. _Pywikibot: https://www.mediawiki.org/wiki/Manual:Pywikibot
@@ -0,0 +1,64 @@
+# This config file is used by appveyor.com to build Windows release binaries
+
+version: 0.4.1-b{build}
+
+branches:
+  only:
+    - master
+
+skip_tags: true
+
+environment:
+  global:
+    # See: http://stackoverflow.com/a/13751649/163740
+    WRAPPER: "cmd /E:ON /V:ON /C .\\scripts\\win_wrapper.cmd"
+    PIP: "%WRAPPER% %PYTHON%\\Scripts\\pip.exe"
+    SETUPPY: "%WRAPPER% %PYTHON%\\python setup.py --with-extension"
+    PYPI_USERNAME: "earwigbot"
+    PYPI_PASSWORD:
+      secure: gOIcvPxSC2ujuhwOzwj3v8xjq3CCYd8keFWVnguLM+gcL0e02qshDHy7gwZZwj0+
+
+  matrix:
+    - PYTHON: "C:\\Python27"
+      PYTHON_VERSION: "2.7"
+      PYTHON_ARCH: "32"
+
+    - PYTHON: "C:\\Python27-x64"
+      PYTHON_VERSION: "2.7"
+      PYTHON_ARCH: "64"
+
+    - PYTHON: "C:\\Python33"
+      PYTHON_VERSION: "3.3"
+      PYTHON_ARCH: "32"
+
+    - PYTHON: "C:\\Python33-x64"
+      PYTHON_VERSION: "3.3"
+      PYTHON_ARCH: "64"
+
+    - PYTHON: "C:\\Python34"
+      PYTHON_VERSION: "3.4"
+      PYTHON_ARCH: "32"
+
+    - PYTHON: "C:\\Python34-x64"
+      PYTHON_VERSION: "3.4"
+      PYTHON_ARCH: "64"
+
+install:
+  - "%PIP% install wheel twine"
+
+build_script:
+  - "%SETUPPY% build"
+
+test_script:
+  - "%SETUPPY% -q test"
+
+after_test:
+  - "%SETUPPY% bdist_wheel"
+
+on_success:
+  - "twine upload dist\\* -u %PYPI_USERNAME% -p %PYPI_PASSWORD%"
+
+artifacts:
+  - path: dist\*
+
+deploy: off
@@ -1,11 +1,43 @@
 Changelog
 =========
 
+v0.4.1
+------
+
+`Released July 30, 2015 <https://github.com/earwig/mwparserfromhell/tree/v0.4.1>`_
+(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.4...v0.4.1>`__):
+
+- The process for building Windows binaries has been fixed, and these should be
+  distributed along with new releases. Windows users can now take advantage of
+  C speedups without having a compiler of their own.
+- Added support for Python 3.5.
+- ``<`` and ``>`` are now disallowed in wikilink titles and template names.
+  This includes when denoting tags, but not comments.
+- Fixed the behavior of *preserve_spacing* in :meth:`.Template.add` and
+  *keep_field* in :meth:`.Template.remove` on parameters with hidden keys.
+- Removed :meth:`._ListProxy.detach`. :class:`.SmartList`\ s now use weak
+  references and their children are garbage-collected properly.
+- Fixed parser bugs involving:
+
+  - templates with completely blank names;
+  - templates with newlines and comments.
+
+- Heavy refactoring and fixes to the C tokenizer, including:
+
+  - corrected a design flaw in text handling, allowing for substantial speed
+    improvements when parsing long strings of plain text;
+  - implemented new Python 3.3
+    `PEP 393 <https://www.python.org/dev/peps/pep-0393/>`_ Unicode APIs.
+
+- Fixed various bugs in :class:`.SmartList`, including one that was causing
+  memory issues on 64-bit builds of Python 2 on Windows.
+- Fixed some bugs in the release scripts.
+
 v0.4
 ----
 
-Unreleased
-(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.3.3...develop>`__):
+`Released May 23, 2015 <https://github.com/earwig/mwparserfromhell/tree/v0.4>`_
+(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.3.3...v0.4>`__):
 
 - The parser now falls back on pure Python mode if C extensions cannot be
   built. This fixes an issue that prevented some Windows users from installing
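The wikilink and template-name restriction in the entry above is easiest to see in action. A quick sketch of the expected behavior; the outputs are illustrative, not taken from this diff::

    import mwparserfromhell

    # "<" and ">" now invalidate a wikilink title, so no Wikilink node
    # is produced here; the text and the <b> tag are parsed separately.
    code = mwparserfromhell.parse("[[foo<b>bar</b>]]")
    print(code.filter_wikilinks())  # expected: []

    # Ordinary wikilinks are unaffected.
    code = mwparserfromhell.parse("[[foo|bar]]")
    print(code.filter_wikilinks())  # expected: ['[[foo|bar]]']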
@@ -17,24 +17,22 @@ Development occurs on GitHub_.
 Installation
 ------------
 
-The easiest way to install the parser is through the `Python Package Index`_,
-so you can install the latest release with ``pip install mwparserfromhell``
-(`get pip`_). Alternatively, get the latest development version::
+The easiest way to install the parser is through the `Python Package Index`_;
+you can install the latest release with ``pip install mwparserfromhell``
+(`get pip`_). On Windows, make sure you have the latest version of pip
+installed by running ``pip install --upgrade pip``.
+
+Alternatively, get the latest development version::
 
     git clone https://github.com/earwig/mwparserfromhell.git
     cd mwparserfromhell
     python setup.py install
 
-If you get ``error: Unable to find vcvarsall.bat`` while installing, this is
-because Windows can't find the compiler for C extensions. Consult this
-`StackOverflow question`_ for help. You can also set ``ext_modules`` in
-``setup.py`` to an empty list to prevent the extension from building.
-
-You can run the comprehensive unit testing suite with ``python setup.py test``.
+You can run the comprehensive unit testing suite with
+``python setup.py test -q``.
 
 .. _Python Package Index: http://pypi.python.org
 .. _get pip: http://pypi.python.org/pypi/pip
-.. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat
 
 Contents
 --------
@@ -25,7 +25,7 @@ If you're not using a library, you can parse any page using the following code
     from urllib.parse import urlencode
     from urllib.request import urlopen
     import mwparserfromhell
-    API_URL = "http://en.wikipedia.org/w/api.php"
+    API_URL = "https://en.wikipedia.org/w/api.php"
 
     def parse(title):
         data = {"action": "query", "prop": "revisions", "rvlimit": 1,
@@ -29,7 +29,7 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode.
 __author__ = "Ben Kurtovic"
 __copyright__ = "Copyright (C) 2012, 2013, 2014, 2015 Ben Kurtovic"
 __license__ = "MIT License"
-__version__ = "0.4"
+__version__ = "0.4.1"
 __email__ = "ben.kurtovic@gmail.com"
 
 from . import (compat, definitions, nodes, parser, smart_list, string_mixin,
@@ -18,14 +18,12 @@ if py3k:
     bytes = bytes
     str = str
     range = range
-    maxsize = sys.maxsize
     import html.entities as htmlentities
 
 else:
     bytes = str
     str = unicode
     range = xrange
-    maxsize = sys.maxint
     import htmlentitydefs as htmlentities
 
 del sys
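With *maxsize* dropped from the shim, callers can use ``sys.maxsize`` directly, which exists on both Python 2.6+ and 3.x. The remaining shims are consumed like so; a small sketch assuming the module's public names stay as shown above::

    import sys

    from mwparserfromhell.compat import htmlentities, str

    # One spelling of the text type and the entity table on 2.x and 3.x:
    print(str("caf\xe9"))
    print(htmlentities.name2codepoint["amp"])  # 38

    # Code that previously used compat.maxsize asks sys directly:
    limit = sys.maxsize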
@@ -81,10 +81,8 @@ def is_single_only(tag):
     """Return whether or not the given *tag* must exist without a close tag."""
     return tag.lower() in SINGLE_ONLY
 
-def is_scheme(scheme, slashes=True, reverse=False):
+def is_scheme(scheme, slashes=True):
     """Return whether *scheme* is valid for external links."""
-    if reverse:  # Convenience for C
-        scheme = scheme[::-1]
     scheme = scheme.lower()
     if slashes:
         return scheme in URI_SCHEMES
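A quick usage sketch of the simplified signature; the ``reverse`` convenience now lives on the C side, so Python callers reverse the string themselves, and the return values shown are what the docstring implies::

    from mwparserfromhell.definitions import is_scheme

    print(is_scheme("http"))  # True: valid scheme for "http://..." links
    print(is_scheme("spam"))  # False: not a recognized URI scheme

    # Equivalent of the old is_scheme("ptth", reverse=True):
    print(is_scheme("ptth"[::-1]))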
@@ -82,21 +82,11 @@ class Template(Node):
             if char in node:
                 code.replace(node, node.replace(char, replacement), False)
 
-    def _blank_param_value(self, value):
-        """Remove the content from *value* while keeping its whitespace.
-
-        Replace *value*\ 's nodes with two text nodes, the first containing
-        whitespace from before its content and the second containing whitespace
-        from after its content.
-        """
-        match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS)
-        value.nodes = [Text(match.group(1)), Text(match.group(2))]
-
     def _select_theory(self, theories):
         """Return the most likely spacing convention given different options.
 
-        Given a dictionary of convention options as keys and their occurrence as
-        values, return the convention that occurs the most, or ``None`` if
+        Given a dictionary of convention options as keys and their occurrence
+        as values, return the convention that occurs the most, or ``None`` if
         there is no clear preferred style.
         """
         if theories:
@@ -129,34 +119,47 @@ class Template(Node):
         after = self._select_theory(after_theories)
         return before, after
 
-    def _remove_with_field(self, param, i, name):
-        """Return True if a parameter name should be kept, otherwise False."""
-        if param.showkey:
-            following = self.params[i+1:]
-            better_matches = [after.name.strip() == name and not after.showkey for after in following]
-            if any(better_matches):
-                return False
-        return True
-
-    def _remove_without_field(self, param, i):
-        """Return False if a parameter name should be kept, otherwise True."""
-        if not param.showkey:
-            dependents = [not after.showkey for after in self.params[i+1:]]
-            if any(dependents):
-                return False
-        return True
+    def _blank_param_value(self, value):
+        """Remove the content from *value* while keeping its whitespace.
+
+        Replace *value*\ 's nodes with two text nodes, the first containing
+        whitespace from before its content and the second containing whitespace
+        from after its content.
+        """
+        match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS)
+        value.nodes = [Text(match.group(1)), Text(match.group(2))]
+
+    def _fix_dependendent_params(self, i):
+        """Unhide keys if necessary after removing the param at index *i*."""
+        if not self.params[i].showkey:
+            for param in self.params[i + 1:]:
+                if not param.showkey:
+                    param.showkey = True
 
     def _remove_exact(self, needle, keep_field):
         """Remove a specific parameter, *needle*, from the template."""
         for i, param in enumerate(self.params):
             if param is needle:
-                if keep_field or not self._remove_without_field(param, i):
+                if keep_field:
                     self._blank_param_value(param.value)
                 else:
+                    self._fix_dependendent_params(i)
                     self.params.pop(i)
                 return
         raise ValueError(needle)
 
+    def _should_remove(self, i, name):
+        """Look ahead for a parameter with the same name, but hidden.
+
+        If one exists, we should remove the given one rather than blanking it.
+        """
+        if self.params[i].showkey:
+            following = self.params[i + 1:]
+            better_matches = [after.name.strip() == name and not after.showkey
+                              for after in following]
+            return any(better_matches)
+        return False
+
     @property
     def name(self):
         """The name of the template, as a :class:`.Wikicode` object."""
@@ -213,26 +216,25 @@ class Template(Node):
         :func:`.utils.parse_anything`; pipes and equal signs are automatically
         escaped from *value* when appropriate.
 
-        If *name* is already a parameter in the template, we'll replace its
-        value.
-
         If *showkey* is given, this will determine whether or not to show the
         parameter's name (e.g., ``{{foo|bar}}``'s parameter has a name of
         ``"1"`` but it is hidden); otherwise, we'll make a safe and intelligent
         guess.
 
+        If *name* is already a parameter in the template, we'll replace its
+        value while keeping the same whitespace around it. We will also try to
+        guess the dominant spacing convention when adding a new parameter using
+        :meth:`_get_spacing_conventions`.
+
         If *before* is given (either a :class:`.Parameter` object or a name),
         then we will place the parameter immediately before this one.
         Otherwise, it will be added at the end. If *before* is a name and
         exists multiple times in the template, we will place it before the last
         occurrence. If *before* is not in the template, :exc:`ValueError` is
-        raised. The argument is ignored if the new parameter already exists.
+        raised. The argument is ignored if *name* is an existing parameter.
 
-        If *preserve_spacing* is ``False``, we will avoid preserving spacing
-        conventions when changing the value of an existing parameter or when
-        adding a new one.
+        If *preserve_spacing* is ``True``, we will try to preserve whitespace
+        conventions around the parameter, whether it is new or we are updating
+        an existing value. It is disabled for parameters with hidden keys,
+        since MediaWiki doesn't strip whitespace in this case.
         """
         name, value = parse_anything(name), parse_anything(value)
         self._surface_escape(value, "|")
@@ -245,7 +247,7 @@ class Template(Node):
             if not existing.showkey:
                 self._surface_escape(value, "=")
             nodes = existing.value.nodes
-            if preserve_spacing:
+            if preserve_spacing and existing.showkey:
                 for i in range(2):  # Ignore empty text nodes
                     if not nodes[i]:
                         nodes[i] = None
@@ -271,7 +273,7 @@ class Template(Node):
         if not showkey:
             self._surface_escape(value, "=")
 
-        if preserve_spacing:
+        if preserve_spacing and showkey:
             before_n, after_n = self._get_spacing_conventions(use_names=True)
             before_v, after_v = self._get_spacing_conventions(use_names=False)
             name = parse_anything([before_n, name, after_n])
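A usage sketch of the *preserve_spacing* behavior the docstring now describes; the exact rendered spacing is illustrative::

    import mwparserfromhell

    code = mwparserfromhell.parse("{{foo| bar = baz }}")
    tmpl = code.filter_templates()[0]

    # Replacing an existing value keeps the surrounding whitespace.
    tmpl.add("bar", "qux")
    print(code)   # expected: {{foo| bar = qux }}

    # Hidden keys skip spacing preservation, since MediaWiki does not
    # strip whitespace around positional parameter values.
    code2 = mwparserfromhell.parse("{{foo|bar}}")
    code2.filter_templates()[0].add("1", "new value")
    print(code2)  # expected: {{foo|new value}}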
@@ -294,36 +296,39 @@ class Template(Node):
         and :meth:`get`.
 
         If *keep_field* is ``True``, we will keep the parameter's name, but
-        blank its value. Otherwise, we will remove the parameter completely
-        *unless* other parameters are dependent on it (e.g. removing ``bar``
-        from ``{{foo|bar|baz}}`` is unsafe because ``{{foo|baz}}`` is not what
-        we expected, so ``{{foo||baz}}`` will be produced instead).
+        blank its value. Otherwise, we will remove the parameter completely.
+
+        When removing a parameter with a hidden name, subsequent parameters
+        with hidden names will be made visible. For example, removing ``bar``
+        from ``{{foo|bar|baz}}`` produces ``{{foo|2=baz}}`` because
+        ``{{foo|baz}}`` is incorrect.
 
         If the parameter shows up multiple times in the template and *param* is
         not a :class:`.Parameter` object, we will remove all instances of it
-        (and keep only one if *keep_field* is ``True`` - the first instance if
-        none have dependents, otherwise the one with dependents will be kept).
+        (and keep only one if *keep_field* is ``True`` - either the one with a
+        hidden name, if it exists, or the first instance).
         """
         if isinstance(param, Parameter):
             return self._remove_exact(param, keep_field)
+
         name = str(param).strip()
         removed = False
         to_remove = []
+
         for i, param in enumerate(self.params):
             if param.name.strip() == name:
                 if keep_field:
-                    if self._remove_with_field(param, i, name):
-                        self._blank_param_value(param.value)
-                        keep_field = False
-                    else:
-                        to_remove.append(i)
-                else:
-                    if self._remove_without_field(param, i):
+                    if self._should_remove(i, name):
                         to_remove.append(i)
                     else:
                         self._blank_param_value(param.value)
+                        keep_field = False
+                else:
+                    self._fix_dependendent_params(i)
+                    to_remove.append(i)
                 if not removed:
                     removed = True
+
         if not removed:
             raise ValueError(name)
         for i in reversed(to_remove):
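A corresponding usage sketch for the new removal semantics; the ``{{foo|2=baz}}`` result comes straight from the docstring above::

    import mwparserfromhell

    code = mwparserfromhell.parse("{{foo|bar|baz}}")
    tmpl = code.filter_templates()[0]

    # Removing the first positional parameter unhides the next one's key,
    # so "baz" keeps meaning parameter 2.
    tmpl.remove("1")
    print(code)   # {{foo|2=baz}}

    # keep_field=True blanks the value but keeps the field itself.
    code2 = mwparserfromhell.parse("{{foo|bar=baz}}")
    code2.filter_templates()[0].remove("bar", keep_field=True)
    print(code2)  # {{foo|bar=}}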
@@ -40,11 +40,11 @@ class ParserError(Exception):
 from .builder import Builder
-from .tokenizer import Tokenizer
 try:
     from ._tokenizer import CTokenizer
     use_c = True
 except ImportError:
+    from .tokenizer import Tokenizer
     CTokenizer = None
     use_c = False
@@ -70,6 +70,7 @@ class Parser(object):
         if use_c and CTokenizer:
             self._tokenizer = CTokenizer()
         else:
+            from .tokenizer import Tokenizer
             self._tokenizer = Tokenizer()
         self._builder = Builder()
@@ -89,6 +89,7 @@ Local (stack-specific) contexts:
     * :const:`FAIL_ON_LBRACE`
     * :const:`FAIL_ON_RBRACE`
     * :const:`FAIL_ON_EQUALS`
+    * :const:`HAS_TEMPLATE`
 
 * :const:`TABLE`
@@ -161,15 +162,16 @@ FAIL_NEXT = 1 << 26
 FAIL_ON_LBRACE = 1 << 27
 FAIL_ON_RBRACE = 1 << 28
 FAIL_ON_EQUALS = 1 << 29
+HAS_TEMPLATE = 1 << 30
 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
-                FAIL_ON_RBRACE + FAIL_ON_EQUALS)
+                FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE)
 
-TABLE_OPEN = 1 << 30
-TABLE_CELL_OPEN = 1 << 31
-TABLE_CELL_STYLE = 1 << 32
-TABLE_ROW_OPEN = 1 << 33
-TABLE_TD_LINE = 1 << 34
-TABLE_TH_LINE = 1 << 35
+TABLE_OPEN = 1 << 31
+TABLE_CELL_OPEN = 1 << 32
+TABLE_CELL_STYLE = 1 << 33
+TABLE_ROW_OPEN = 1 << 34
+TABLE_TD_LINE = 1 << 35
+TABLE_TH_LINE = 1 << 36
 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
 TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
          TABLE_TD_LINE + TABLE_TH_LINE)
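These contexts are plain bit flags, so checks and updates are bitwise. A quick sketch using the values above; note the table flags now exceed 31 bits, which is why the C tokenizer stores contexts in a ``uint64_t``::

    FAIL_ON_EQUALS = 1 << 29
    HAS_TEMPLATE = 1 << 30
    TABLE_OPEN = 1 << 31

    context = HAS_TEMPLATE | TABLE_OPEN

    print(bool(context & HAS_TEMPLATE))  # True: flag is set
    context &= ~TABLE_OPEN               # leave the table context
    print(bool(context & TABLE_OPEN))    # False: flag cleared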
@@ -0,0 +1,125 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#pragma once
+
+#ifndef PY_SSIZE_T_CLEAN
+#define PY_SSIZE_T_CLEAN  // See: https://docs.python.org/2/c-api/arg.html
+#endif
+
+#include <Python.h>
+#include <structmember.h>
+#include <bytesobject.h>
+
+/* Compatibility macros */
+
+#if PY_MAJOR_VERSION >= 3
+#define IS_PY3K
+#endif
+
+#ifndef uint64_t
+#define uint64_t unsigned PY_LONG_LONG
+#endif
+
+#define malloc PyObject_Malloc  // XXX: yuck
+#define realloc PyObject_Realloc
+#define free PyObject_Free
+
+/* Unicode support macros */
+
+#if defined(IS_PY3K) && PY_MINOR_VERSION >= 3
+#define PEP_393
+#endif
+
+#ifdef PEP_393
+#define Unicode Py_UCS4
+#define PyUnicode_FROM_SINGLE(chr) \
+    PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)
+#else
+#define Unicode Py_UNICODE
+#define PyUnicode_FROM_SINGLE(chr) \
+    PyUnicode_FromUnicode(&(chr), 1)
+#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE
+#endif
+
+/* Error handling macros */
+
+#define BAD_ROUTE self->route_state
+#define BAD_ROUTE_CONTEXT self->route_context
+#define FAIL_ROUTE(context) {      \
+    self->route_state = 1;         \
+    self->route_context = context; \
+}
+#define RESET_ROUTE() self->route_state = 0
+
+/* Shared globals */
+
+extern char** entitydefs;
+extern PyObject* NOARGS;
+extern PyObject* definitions;
+
+/* Structs */
+
+typedef struct {
+    Py_ssize_t capacity;
+    Py_ssize_t length;
+#ifdef PEP_393
+    PyObject* object;
+    int kind;
+    void* data;
+#else
+    Py_UNICODE* data;
+#endif
+} Textbuffer;
+
+struct Stack {
+    PyObject* stack;
+    uint64_t context;
+    Textbuffer* textbuffer;
+    struct Stack* next;
+};
+typedef struct Stack Stack;
+
+typedef struct {
+    PyObject* object;   /* base PyUnicodeObject object */
+    Py_ssize_t length;  /* length of object, in code points */
+#ifdef PEP_393
+    int kind;           /* object's kind value */
+    void* data;         /* object's raw unicode buffer */
+#else
+    Py_UNICODE* buf;    /* object's internal buffer */
+#endif
+} TokenizerInput;
+
+typedef struct {
+    PyObject_HEAD
+    TokenizerInput text;     /* text to tokenize */
+    Stack* topstack;         /* topmost stack */
+    Py_ssize_t head;         /* current position in text */
+    int global;              /* global context */
+    int depth;               /* stack recursion depth */
+    int cycles;              /* total number of stack recursions */
+    int route_state;         /* whether a BadRoute has been triggered */
+    uint64_t route_context;  /* context when the last BadRoute was triggered */
+    int skip_style_tags;     /* temp fix for the sometimes broken tag parser */
+} Tokenizer;
@@ -0,0 +1,105 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#pragma once
+
+/* Local contexts */
+
+#define LC_TEMPLATE             0x0000000000000007
+#define LC_TEMPLATE_NAME        0x0000000000000001
+#define LC_TEMPLATE_PARAM_KEY   0x0000000000000002
+#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004
+
+#define LC_ARGUMENT         0x0000000000000018
+#define LC_ARGUMENT_NAME    0x0000000000000008
+#define LC_ARGUMENT_DEFAULT 0x0000000000000010
+
+#define LC_WIKILINK       0x0000000000000060
+#define LC_WIKILINK_TITLE 0x0000000000000020
+#define LC_WIKILINK_TEXT  0x0000000000000040
+
+#define LC_EXT_LINK       0x0000000000000180
+#define LC_EXT_LINK_URI   0x0000000000000080
+#define LC_EXT_LINK_TITLE 0x0000000000000100
+
+#define LC_HEADING         0x0000000000007E00
+#define LC_HEADING_LEVEL_1 0x0000000000000200
+#define LC_HEADING_LEVEL_2 0x0000000000000400
+#define LC_HEADING_LEVEL_3 0x0000000000000800
+#define LC_HEADING_LEVEL_4 0x0000000000001000
+#define LC_HEADING_LEVEL_5 0x0000000000002000
+#define LC_HEADING_LEVEL_6 0x0000000000004000
+
+#define LC_TAG       0x0000000000078000
+#define LC_TAG_OPEN  0x0000000000008000
+#define LC_TAG_ATTR  0x0000000000010000
+#define LC_TAG_BODY  0x0000000000020000
+#define LC_TAG_CLOSE 0x0000000000040000
+
+#define LC_STYLE             0x0000000000780000
+#define LC_STYLE_ITALICS     0x0000000000080000
+#define LC_STYLE_BOLD        0x0000000000100000
+#define LC_STYLE_PASS_AGAIN  0x0000000000200000
+#define LC_STYLE_SECOND_PASS 0x0000000000400000
+
+#define LC_DLTERM 0x0000000000800000
+
+#define LC_SAFETY_CHECK   0x000000007F000000
+#define LC_HAS_TEXT       0x0000000001000000
+#define LC_FAIL_ON_TEXT   0x0000000002000000
+#define LC_FAIL_NEXT      0x0000000004000000
+#define LC_FAIL_ON_LBRACE 0x0000000008000000
+#define LC_FAIL_ON_RBRACE 0x0000000010000000
+#define LC_FAIL_ON_EQUALS 0x0000000020000000
+#define LC_HAS_TEMPLATE   0x0000000040000000
+
+#define LC_TABLE                    0x0000001F80000000
+#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000
+#define LC_TABLE_OPEN               0x0000000080000000
+#define LC_TABLE_CELL_OPEN          0x0000000100000000
+#define LC_TABLE_CELL_STYLE         0x0000000200000000
+#define LC_TABLE_ROW_OPEN           0x0000000400000000
+#define LC_TABLE_TD_LINE            0x0000000800000000
+#define LC_TABLE_TH_LINE            0x0000001000000000
+
+/* Global contexts */
+
+#define GL_HEADING 0x1
+
+/* Aggregate contexts */
+
+#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
+#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
+#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
+#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
+#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)
+
+/* Tag contexts */
+
+#define TAG_NAME        0x01
+#define TAG_ATTR_READY  0x02
+#define TAG_ATTR_NAME   0x04
+#define TAG_ATTR_VALUE  0x08
+#define TAG_QUOTED      0x10
+#define TAG_NOTE_SPACE  0x20
+#define TAG_NOTE_EQUALS 0x40
+#define TAG_NOTE_QUOTE  0x80
@@ -0,0 +1,78 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "tag_data.h"
+#include "contexts.h"
+
+/*
+    Initialize a new TagData object.
+*/
+TagData* TagData_new(TokenizerInput* text)
+{
+#define ALLOC_BUFFER(name)       \
+    name = Textbuffer_new(text); \
+    if (!name) {                 \
+        TagData_dealloc(self);   \
+        return NULL;             \
+    }
+
+    TagData *self = malloc(sizeof(TagData));
+    if (!self) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    self->context = TAG_NAME;
+    ALLOC_BUFFER(self->pad_first)
+    ALLOC_BUFFER(self->pad_before_eq)
+    ALLOC_BUFFER(self->pad_after_eq)
+    self->quoter = 0;
+    self->reset = 0;
+    return self;
+
+#undef ALLOC_BUFFER
+}
+
+/*
+    Deallocate the given TagData object.
+*/
+void TagData_dealloc(TagData* self)
+{
+    if (self->pad_first)
+        Textbuffer_dealloc(self->pad_first);
+    if (self->pad_before_eq)
+        Textbuffer_dealloc(self->pad_before_eq);
+    if (self->pad_after_eq)
+        Textbuffer_dealloc(self->pad_after_eq);
+    free(self);
+}
+
+/*
+    Clear the internal buffers of the given TagData object.
+*/
+int TagData_reset_buffers(TagData* self)
+{
+    if (Textbuffer_reset(self->pad_first) ||
+        Textbuffer_reset(self->pad_before_eq) ||
+        Textbuffer_reset(self->pad_after_eq))
+        return -1;
+    return 0;
+}
@@ -0,0 +1,43 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#pragma once
+
+#include "common.h"
+#include "textbuffer.h"
+
+/* Structs */
+
+typedef struct {
+    uint64_t context;
+    Textbuffer* pad_first;
+    Textbuffer* pad_before_eq;
+    Textbuffer* pad_after_eq;
+    Unicode quoter;
+    Py_ssize_t reset;
+} TagData;
+
+/* Functions */
+
+TagData* TagData_new(TokenizerInput*);
+void TagData_dealloc(TagData*);
+int TagData_reset_buffers(TagData*);
@@ -0,0 +1,232 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "textbuffer.h"
+
+#define INITIAL_CAPACITY 32
+#define RESIZE_FACTOR 2
+#define CONCAT_EXTRA 32
+
+/*
+    Internal allocation function for textbuffers.
+*/
+static int internal_alloc(Textbuffer* self, Unicode maxchar)
+{
+    self->capacity = INITIAL_CAPACITY;
+    self->length = 0;
+
+#ifdef PEP_393
+    self->object = PyUnicode_New(self->capacity, maxchar);
+    if (!self->object)
+        return -1;
+    self->kind = PyUnicode_KIND(self->object);
+    self->data = PyUnicode_DATA(self->object);
+#else
+    (void) maxchar;  // Unused
+    self->data = malloc(sizeof(Unicode) * self->capacity);
+    if (!self->data)
+        return -1;
+#endif
+
+    return 0;
+}
+
+/*
+    Internal deallocation function for textbuffers.
+*/
+static void internal_dealloc(Textbuffer* self)
+{
+#ifdef PEP_393
+    Py_DECREF(self->object);
+#else
+    free(self->data);
+#endif
+}
+
+/*
+    Internal resize function.
+*/
+static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
+{
+#ifdef PEP_393
+    PyObject *newobj;
+    void *newdata;
+
+    newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object));
+    if (!newobj)
+        return -1;
+    newdata = PyUnicode_DATA(newobj);
+    memcpy(newdata, self->data, self->length * self->kind);
+    Py_DECREF(self->object);
+    self->object = newobj;
+    self->data = newdata;
+#else
+    if (!(self->data = realloc(self->data, sizeof(Unicode) * new_cap)))
+        return -1;
+#endif
+
+    self->capacity = new_cap;
+    return 0;
+}
+
+/*
+    Create a new textbuffer object.
+*/
+Textbuffer* Textbuffer_new(TokenizerInput* text)
+{
+    Textbuffer* self = malloc(sizeof(Textbuffer));
+    Unicode maxchar = 0;
+
+#ifdef PEP_393
+    maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);
+#endif
+
+    if (!self)
+        goto fail_nomem;
+    if (internal_alloc(self, maxchar) < 0)
+        goto fail_dealloc;
+    return self;
+
+    fail_dealloc:
+    free(self);
+    fail_nomem:
+    PyErr_NoMemory();
+    return NULL;
+}
+
+/*
+    Deallocate the given textbuffer.
+*/
+void Textbuffer_dealloc(Textbuffer* self)
+{
+    internal_dealloc(self);
+    free(self);
+}
+
+/*
+    Reset a textbuffer to its initial, empty state.
+*/
+int Textbuffer_reset(Textbuffer* self)
+{
+    Unicode maxchar = 0;
+
+#ifdef PEP_393
+    maxchar = PyUnicode_MAX_CHAR_VALUE(self->object);
+#endif
+
+    internal_dealloc(self);
+    if (internal_alloc(self, maxchar))
+        return -1;
+    return 0;
+}
+
+/*
+    Write a Unicode codepoint to the given textbuffer.
+*/
+int Textbuffer_write(Textbuffer* self, Unicode code)
+{
+    if (self->length >= self->capacity) {
+        if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0)
+            return -1;
+    }
+
+#ifdef PEP_393
+    PyUnicode_WRITE(self->kind, self->data, self->length++, code);
+#else
+    self->data[self->length++] = code;
+#endif
+
+    return 0;
+}
+
+/*
+    Read a Unicode codepoint from the given index of the given textbuffer.
+
+    This function does not check for bounds.
+*/
+Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index)
+{
+#ifdef PEP_393
+    return PyUnicode_READ(self->kind, self->data, index);
+#else
+    return self->data[index];
+#endif
+}
+
+/*
+    Return the contents of the textbuffer as a Python Unicode object.
+*/
+PyObject* Textbuffer_render(Textbuffer* self)
+{
+#ifdef PEP_393
+    return PyUnicode_FromKindAndData(self->kind, self->data, self->length);
+#else
+    return PyUnicode_FromUnicode(self->data, self->length);
+#endif
+}
+
+/*
+    Concatenate the 'other' textbuffer onto the end of the given textbuffer.
+*/
+int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
+{
+    Py_ssize_t newlen = self->length + other->length;
+
+    if (newlen > self->capacity) {
+        if (internal_resize(self, newlen + CONCAT_EXTRA) < 0)
+            return -1;
+    }
+
+#ifdef PEP_393
+    assert(self->kind == other->kind);
+    memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data,
+           other->length * other->kind);
+#else
+    memcpy(self->data + self->length, other->data,
+           other->length * sizeof(Unicode));
+#endif
+
+    self->length = newlen;
+    return 0;
+}
+
+/*
+    Reverse the contents of the given textbuffer.
+*/
+void Textbuffer_reverse(Textbuffer* self)
+{
+    Py_ssize_t i, end = self->length - 1;
+    Unicode tmp;
+
+    for (i = 0; i < self->length / 2; i++) {
+#ifdef PEP_393
+        tmp = PyUnicode_READ(self->kind, self->data, i);
+        PyUnicode_WRITE(self->kind, self->data, i,
+                        PyUnicode_READ(self->kind, self->data, end - i));
+        PyUnicode_WRITE(self->kind, self->data, end - i, tmp);
+#else
+        tmp = self->data[i];
+        self->data[i] = self->data[end - i];
+        self->data[end - i] = tmp;
+#endif
+    }
+}
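Before the headers, the buffer's growth policy is worth restating outside of C. A minimal Python sketch of the same append/concat behavior, using the INITIAL_CAPACITY, RESIZE_FACTOR, and CONCAT_EXTRA values defined above::

    INITIAL_CAPACITY = 32
    RESIZE_FACTOR = 2
    CONCAT_EXTRA = 32

    class Textbuffer(object):
        """Toy model of the C textbuffer's resizing behavior."""

        def __init__(self):
            self.capacity = INITIAL_CAPACITY
            self.length = 0
            self.data = [None] * self.capacity

        def write(self, code):
            # Double the capacity whenever the buffer fills up.
            if self.length >= self.capacity:
                self._resize(self.capacity * RESIZE_FACTOR)
            self.data[self.length] = code
            self.length += 1

        def concat(self, other):
            # Grow once, with slack, instead of doubling repeatedly.
            newlen = self.length + other.length
            if newlen > self.capacity:
                self._resize(newlen + CONCAT_EXTRA)
            self.data[self.length:newlen] = other.data[:other.length]
            self.length = newlen

        def _resize(self, new_cap):
            self.data.extend([None] * (new_cap - self.capacity))
            self.capacity = new_cap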
@@ -0,0 +1,36 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#pragma once
+
+#include "common.h"
+
+/* Functions */
+
+Textbuffer* Textbuffer_new(TokenizerInput*);
+void Textbuffer_dealloc(Textbuffer*);
+int Textbuffer_reset(Textbuffer*);
+int Textbuffer_write(Textbuffer*, Unicode);
+Unicode Textbuffer_read(Textbuffer*, Py_ssize_t);
+PyObject* Textbuffer_render(Textbuffer*);
+int Textbuffer_concat(Textbuffer*, Textbuffer*);
+void Textbuffer_reverse(Textbuffer*);
@@ -0,0 +1,35 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#pragma once
+
+#include "common.h"
+
+static const char MARKERS[] = {
+    '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
+    '-', '!', '\n', '\0'};
+
+#define NUM_MARKERS 19
+
+/* Functions */
+
+PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);
@@ -0,0 +1,345 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "tok_support.h"
+#include "textbuffer.h"
+#include "tokens.h"
+
+/*
+    Add a new token stack, context, and textbuffer to the list.
+*/
+int Tokenizer_push(Tokenizer* self, uint64_t context)
+{
+    Stack* top = malloc(sizeof(Stack));
+
+    if (!top) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    top->stack = PyList_New(0);
+    top->context = context;
+    top->textbuffer = Textbuffer_new(&self->text);
+    if (!top->textbuffer)
+        return -1;
+    top->next = self->topstack;
+    self->topstack = top;
+    self->depth++;
+    self->cycles++;
+    return 0;
+}
+
+/*
+    Push the textbuffer onto the stack as a Text node and clear it.
+*/
+int Tokenizer_push_textbuffer(Tokenizer* self)
+{
+    PyObject *text, *kwargs, *token;
+    Textbuffer* buffer = self->topstack->textbuffer;
+
+    if (buffer->length == 0)
+        return 0;
+    text = Textbuffer_render(buffer);
+    if (!text)
+        return -1;
+    kwargs = PyDict_New();
+    if (!kwargs) {
+        Py_DECREF(text);
+        return -1;
+    }
+    PyDict_SetItemString(kwargs, "text", text);
+    Py_DECREF(text);
+    token = PyObject_Call(Text, NOARGS, kwargs);
+    Py_DECREF(kwargs);
+    if (!token)
+        return -1;
+    if (PyList_Append(self->topstack->stack, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+    Py_DECREF(token);
+    if (Textbuffer_reset(buffer))
+        return -1;
+    return 0;
+}
+
+/*
+    Pop and deallocate the top token stack/context/textbuffer.
+*/
+void Tokenizer_delete_top_of_stack(Tokenizer* self)
+{
+    Stack* top = self->topstack;
+
+    Py_DECREF(top->stack);
+    Textbuffer_dealloc(top->textbuffer);
+    self->topstack = top->next;
+    free(top);
+    self->depth--;
+}
+
+/*
+    Pop the current stack/context/textbuffer, returning the stack.
+*/
+PyObject* Tokenizer_pop(Tokenizer* self)
+{
+    PyObject* stack;
+
+    if (Tokenizer_push_textbuffer(self))
+        return NULL;
+    stack = self->topstack->stack;
+    Py_INCREF(stack);
+    Tokenizer_delete_top_of_stack(self);
+    return stack;
+}
+
+/*
+    Pop the current stack/context/textbuffer, returning the stack. We will
+    also replace the underlying stack's context with the current stack's.
+*/
+PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
+{
+    PyObject* stack;
+    uint64_t context;
+
+    if (Tokenizer_push_textbuffer(self))
+        return NULL;
+    stack = self->topstack->stack;
+    Py_INCREF(stack);
+    context = self->topstack->context;
+    Tokenizer_delete_top_of_stack(self);
+    self->topstack->context = context;
+    return stack;
+}
+
+/*
+    Fail the current tokenization route. Discards the current
+    stack/context/textbuffer and sets the BAD_ROUTE flag.
+*/
+void* Tokenizer_fail_route(Tokenizer* self)
+{
+    uint64_t context = self->topstack->context;
+    PyObject* stack = Tokenizer_pop(self);
+
+    Py_XDECREF(stack);
+    FAIL_ROUTE(context);
+    return NULL;
+}
+
+/*
+    Write a token to the current token stack.
+*/
+int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first)
+{
+    PyObject* instance;
+
+    if (Tokenizer_push_textbuffer(self))
+        return -1;
+    instance = PyObject_CallObject(token, NULL);
+    if (!instance)
+        return -1;
+    if (first ? PyList_Insert(self->topstack->stack, 0, instance) :
+                PyList_Append(self->topstack->stack, instance)) {
+        Py_DECREF(instance);
+        return -1;
+    }
+    Py_DECREF(instance);
+    return 0;
+}
+
+/*
+    Write a token to the current token stack, with kwargs. Steals a reference
+    to kwargs.
+*/
+int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
+                                PyObject* kwargs, int first)
+{
+    PyObject* instance;
+
+    if (Tokenizer_push_textbuffer(self)) {
+        Py_DECREF(kwargs);
+        return -1;
+    }
+    instance = PyObject_Call(token, NOARGS, kwargs);
+    if (!instance) {
+        Py_DECREF(kwargs);
+        return -1;
+    }
+    if (first ? PyList_Insert(self->topstack->stack, 0, instance):
+                PyList_Append(self->topstack->stack, instance)) {
+        Py_DECREF(instance);
+        Py_DECREF(kwargs);
+        return -1;
+    }
+    Py_DECREF(instance);
+    Py_DECREF(kwargs);
+    return 0;
+}
+
+/*
+    Write a Unicode codepoint to the current textbuffer.
+*/
+int Tokenizer_emit_char(Tokenizer* self, Unicode code)
+{
+    return Textbuffer_write(self->topstack->textbuffer, code);
+}
+
+/*
+    Write a string of text to the current textbuffer.
+*/
+int Tokenizer_emit_text(Tokenizer* self, const char* text)
+{
+    int i = 0;
+
+    while (text[i]) {
+        if (Tokenizer_emit_char(self, text[i]))
+            return -1;
+        i++;
+    }
+    return 0;
+}
+
+/*
+    Write the contents of another textbuffer to the current textbuffer,
+    deallocating it in the process.
+*/
+int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer)
+{
+    int retval = Textbuffer_concat(self->topstack->textbuffer, buffer);
+
+    Textbuffer_dealloc(buffer);
+    return retval;
+}
+
+/*
+    Write a series of tokens to the current stack at once.
+*/
+int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
+{
+    int pushed = 0;
+    PyObject *stack, *token, *left, *right, *text;
+    Textbuffer* buffer;
+    Py_ssize_t size;
+
+    if (PyList_GET_SIZE(tokenlist) > 0) {
+        token = PyList_GET_ITEM(tokenlist, 0);
+        switch (PyObject_IsInstance(token, Text)) {
+            case 0:
+                break;
+            case 1: {
+                pushed = 1;
+                buffer = self->topstack->textbuffer;
+                if (buffer->length == 0)
+                    break;
+                left = Textbuffer_render(buffer);
+                if (!left)
+                    return -1;
+                right = PyObject_GetAttrString(token, "text");
+                if (!right)
+                    return -1;
+                text = PyUnicode_Concat(left, right);
+                Py_DECREF(left);
+                Py_DECREF(right);
+                if (!text)
+                    return -1;
+                if (PyObject_SetAttrString(token, "text", text)) {
+                    Py_DECREF(text);
+                    return -1;
+                }
+                Py_DECREF(text);
+                if (Textbuffer_reset(buffer))
+                    return -1;
+                break;
+            }
+            case -1:
+                return -1;
+        }
+    }
+    if (!pushed) {
+        if (Tokenizer_push_textbuffer(self))
+            return -1;
+    }
+    stack = self->topstack->stack;
+    size = PyList_GET_SIZE(stack);
+    if (PyList_SetSlice(stack, size, size, tokenlist))
+        return -1;
+    return 0;
+}
+
+/*
+    Pop the current stack, write text, and then write the stack. 'text' is a
+    NULL-terminated array of chars.
+*/
+int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
+{
+    PyObject* stack = Tokenizer_pop(self);
+
+    if (Tokenizer_emit_text(self, text)) {
+        Py_DECREF(stack);
+        return -1;
+    }
+    if (stack) {
+        if (PyList_GET_SIZE(stack) > 0) {
+            if (Tokenizer_emit_all(self, stack)) {
+                Py_DECREF(stack);
+                return -1;
+            }
+        }
+        Py_DECREF(stack);
+    }
+    self->head--;
+    return 0;
+}
+
+/*
+    Internal function to read the codepoint at the given index from the input.
+*/
+static Unicode read_codepoint(TokenizerInput* text, Py_ssize_t index)
+{
+#ifdef PEP_393
+    return PyUnicode_READ(text->kind, text->data, index);
+#else
+    return text->buf[index];
+#endif
+}
+
+/*
+    Read the value at a relative point in the wikicode, forwards.
+*/
+Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
+{
+    Py_ssize_t index = self->head + delta;
+
+    if (index >= self->text.length)
+        return '\0';
+    return read_codepoint(&self->text, index);
+}
+
+/*
+    Read the value at a relative point in the wikicode, backwards.
+*/
+Unicode Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
+{
+    Py_ssize_t index;
+
+    if (delta > self->head)
+        return '\0';
+    index = self->head - delta;
+    return read_codepoint(&self->text, index);
+}
@@ -0,0 +1,62 @@ | |||||
/* | |||||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||||
this software and associated documentation files (the "Software"), to deal in | |||||
the Software without restriction, including without limitation the rights to | |||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||||
of the Software, and to permit persons to whom the Software is furnished to do | |||||
so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. | |||||
*/ | |||||
#pragma once | |||||
#include "common.h" | |||||
/* Functions */ | |||||
int Tokenizer_push(Tokenizer*, uint64_t); | |||||
int Tokenizer_push_textbuffer(Tokenizer*); | |||||
void Tokenizer_delete_top_of_stack(Tokenizer*); | |||||
PyObject* Tokenizer_pop(Tokenizer*); | |||||
PyObject* Tokenizer_pop_keeping_context(Tokenizer*); | |||||
void* Tokenizer_fail_route(Tokenizer*); | |||||
int Tokenizer_emit_token(Tokenizer*, PyObject*, int); | |||||
int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int); | |||||
int Tokenizer_emit_char(Tokenizer*, Unicode); | |||||
int Tokenizer_emit_text(Tokenizer*, const char*); | |||||
int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*); | |||||
int Tokenizer_emit_all(Tokenizer*, PyObject*); | |||||
int Tokenizer_emit_text_then_stack(Tokenizer*, const char*); | |||||
Unicode Tokenizer_read(Tokenizer*, Py_ssize_t); | |||||
Unicode Tokenizer_read_backwards(Tokenizer*, Py_ssize_t); | |||||
/* Macros */ | |||||
#define MAX_DEPTH 40 | |||||
#define MAX_CYCLES 100000 | |||||
#define Tokenizer_CAN_RECURSE(self) \ | |||||
(self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES) | |||||
#define Tokenizer_emit(self, token) \ | |||||
Tokenizer_emit_token(self, token, 0) | |||||
#define Tokenizer_emit_first(self, token) \ | |||||
Tokenizer_emit_token(self, token, 1) | |||||
#define Tokenizer_emit_kwargs(self, token, kwargs) \ | |||||
Tokenizer_emit_token_kwargs(self, token, kwargs, 0) | |||||
#define Tokenizer_emit_first_kwargs(self, token, kwargs) \ | |||||
Tokenizer_emit_token_kwargs(self, token, kwargs, 1) |
@@ -0,0 +1,310 @@ | |||||
/* | |||||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||||
this software and associated documentation files (the "Software"), to deal in | |||||
the Software without restriction, including without limitation the rights to | |||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||||
of the Software, and to permit persons to whom the Software is furnished to do | |||||
so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. | |||||
*/ | |||||
#include "tokenizer.h" | |||||
#include "tok_parse.h" | |||||
#include "tokens.h" | |||||
/* Globals */ | |||||
int route_state; | |||||
uint64_t route_context; | |||||
char** entitydefs; | |||||
PyObject* NOARGS; | |||||
PyObject* definitions; | |||||
static PyObject* ParserError; | |||||
/* Forward declarations */ | |||||
static int load_exceptions(void); | |||||
/* | |||||
Create a new tokenizer object. | |||||
*/ | |||||
static PyObject* | |||||
Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds) | |||||
{ | |||||
Tokenizer* self = (Tokenizer*) type->tp_alloc(type, 0); | |||||
return (PyObject*) self; | |||||
} | |||||
/* | |||||
Deallocate the given tokenizer's text field. | |||||
*/ | |||||
static void dealloc_tokenizer_text(TokenizerInput* text) | |||||
{ | |||||
Py_XDECREF(text->object); | |||||
} | |||||
/* | |||||
Deallocate the given tokenizer object. | |||||
*/ | |||||
static void Tokenizer_dealloc(Tokenizer* self) | |||||
{ | |||||
Stack *this = self->topstack, *next; | |||||
dealloc_tokenizer_text(&self->text); | |||||
while (this) { | |||||
Py_DECREF(this->stack); | |||||
Textbuffer_dealloc(this->textbuffer); | |||||
next = this->next; | |||||
free(this); | |||||
this = next; | |||||
} | |||||
Py_TYPE(self)->tp_free((PyObject*) self); | |||||
} | |||||
/* | |||||
Initialize a new tokenizer instance's text field. | |||||
*/ | |||||
static void init_tokenizer_text(TokenizerInput* text) | |||||
{ | |||||
text->object = Py_None; | |||||
Py_INCREF(Py_None); | |||||
text->length = 0; | |||||
#ifdef PEP_393 | |||||
text->kind = PyUnicode_WCHAR_KIND; | |||||
text->data = NULL; | |||||
#else | |||||
text->buf = NULL; | |||||
#endif | |||||
} | |||||
/* | |||||
Initialize a new tokenizer instance by setting instance attributes. | |||||
*/ | |||||
static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) | |||||
{ | |||||
static char* kwlist[] = {NULL}; | |||||
if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist)) | |||||
return -1; | |||||
init_tokenizer_text(&self->text); | |||||
self->topstack = NULL; | |||||
self->head = self->global = self->depth = self->cycles = 0; | |||||
self->route_context = self->route_state = 0; | |||||
self->skip_style_tags = 0; | |||||
return 0; | |||||
} | |||||
/* | |||||
Load input text into the tokenizer. | |||||
*/ | |||||
static int load_tokenizer_text(TokenizerInput* text, PyObject *input) | |||||
{ | |||||
dealloc_tokenizer_text(text); | |||||
text->object = input; | |||||
#ifdef PEP_393 | |||||
if (PyUnicode_READY(input) < 0) | |||||
return -1; | |||||
text->kind = PyUnicode_KIND(input); | |||||
text->data = PyUnicode_DATA(input); | |||||
#else | |||||
text->buf = PyUnicode_AS_UNICODE(input); | |||||
#endif | |||||
text->length = PyUnicode_GET_LENGTH(input); | |||||
return 0; | |||||
} | |||||
/* | |||||
Build a list of tokens from a string of wikicode and return it. | |||||
*/ | |||||
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||||
{ | |||||
PyObject *input, *tokens; | |||||
uint64_t context = 0; | |||||
int skip_style_tags = 0; | |||||
if (PyArg_ParseTuple(args, "U|ii", &input, &context, &skip_style_tags)) { | |||||
Py_INCREF(input); | |||||
if (load_tokenizer_text(&self->text, input)) | |||||
return NULL; | |||||
} | |||||
else { | |||||
const char *encoded; | |||||
Py_ssize_t size; | |||||
/* Failed to parse a Unicode object; try a string instead. */ | |||||
PyErr_Clear(); | |||||
if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context, | |||||
&skip_style_tags)) | |||||
return NULL; | |||||
if (!(input = PyUnicode_FromStringAndSize(encoded, size))) | |||||
return NULL; | |||||
if (load_tokenizer_text(&self->text, input)) | |||||
return NULL; | |||||
} | |||||
self->head = self->global = self->depth = self->cycles = 0; | |||||
self->skip_style_tags = skip_style_tags; | |||||
tokens = Tokenizer_parse(self, context, 1); | |||||
if ((!tokens && !PyErr_Occurred()) || self->topstack) { | |||||
if (!ParserError) { | |||||
if (load_exceptions()) | |||||
return NULL; | |||||
} | |||||
if (BAD_ROUTE) { | |||||
RESET_ROUTE(); | |||||
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); | |||||
} | |||||
else if (self->topstack) | |||||
PyErr_SetString(ParserError, | |||||
"C tokenizer exited with non-empty token stack"); | |||||
else | |||||
PyErr_SetString(ParserError, "C tokenizer exited unexpectedly"); | |||||
        Py_XDECREF(tokens);  /* may be non-NULL if topstack was left dirty */
        return NULL;
} | |||||
return tokens; | |||||
} | |||||
/*
    Load the HTML entity definitions from html.entities (Python 3) or
    htmlentitydefs (Python 2) into the global entitydefs array.
*/
static int load_entities(void)
{ | |||||
PyObject *tempmod, *defmap, *deflist; | |||||
unsigned numdefs, i; | |||||
#ifdef IS_PY3K | |||||
PyObject *string; | |||||
#endif | |||||
tempmod = PyImport_ImportModule(ENTITYDEFS_MODULE); | |||||
if (!tempmod) | |||||
return -1; | |||||
defmap = PyObject_GetAttrString(tempmod, "entitydefs"); | |||||
if (!defmap) | |||||
return -1; | |||||
Py_DECREF(tempmod); | |||||
deflist = PyDict_Keys(defmap); | |||||
if (!deflist) | |||||
return -1; | |||||
Py_DECREF(defmap); | |||||
    /* count the keys via deflist; defmap is a dict and was just released */
    numdefs = (unsigned) PyList_GET_SIZE(deflist);
entitydefs = calloc(numdefs + 1, sizeof(char*)); | |||||
if (!entitydefs) | |||||
return -1; | |||||
for (i = 0; i < numdefs; i++) { | |||||
#ifdef IS_PY3K | |||||
string = PyUnicode_AsASCIIString(PyList_GET_ITEM(deflist, i)); | |||||
if (!string) | |||||
return -1; | |||||
entitydefs[i] = PyBytes_AsString(string); | |||||
#else | |||||
entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i)); | |||||
#endif | |||||
if (!entitydefs[i]) | |||||
return -1; | |||||
} | |||||
Py_DECREF(deflist); | |||||
return 0; | |||||
} | |||||
/*
    Import mwparserfromhell.parser.tokens and cache its token classes in
    C globals via load_tokens_from_module().
*/
static int load_tokens(void)
{ | |||||
PyObject *tempmod, *tokens, | |||||
*globals = PyEval_GetGlobals(), | |||||
*locals = PyEval_GetLocals(), | |||||
*fromlist = PyList_New(1), | |||||
*modname = IMPORT_NAME_FUNC("tokens"); | |||||
char *name = "mwparserfromhell.parser"; | |||||
if (!fromlist || !modname) | |||||
return -1; | |||||
PyList_SET_ITEM(fromlist, 0, modname); | |||||
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0); | |||||
Py_DECREF(fromlist); | |||||
if (!tempmod) | |||||
return -1; | |||||
    tokens = PyObject_GetAttrString(tempmod, "tokens");
    Py_DECREF(tempmod);
    if (!tokens)
        return -1;
    load_tokens_from_module(tokens);
    Py_DECREF(tokens);
return 0; | |||||
} | |||||
/*
    Import mwparserfromhell.definitions and store the module in a global.
*/
static int load_defs(void)
{ | |||||
PyObject *tempmod, | |||||
*globals = PyEval_GetGlobals(), | |||||
*locals = PyEval_GetLocals(), | |||||
*fromlist = PyList_New(1), | |||||
*modname = IMPORT_NAME_FUNC("definitions"); | |||||
char *name = "mwparserfromhell"; | |||||
if (!fromlist || !modname) | |||||
return -1; | |||||
PyList_SET_ITEM(fromlist, 0, modname); | |||||
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0); | |||||
Py_DECREF(fromlist); | |||||
if (!tempmod) | |||||
return -1; | |||||
    definitions = PyObject_GetAttrString(tempmod, "definitions");
    Py_DECREF(tempmod);
    if (!definitions)
        return -1;
    return 0;
} | |||||
/*
    Import mwparserfromhell.parser and store its ParserError exception in a
    global.
*/
static int load_exceptions(void)
{ | |||||
PyObject *tempmod, *parsermod, | |||||
*globals = PyEval_GetGlobals(), | |||||
*locals = PyEval_GetLocals(), | |||||
*fromlist = PyList_New(1), | |||||
*modname = IMPORT_NAME_FUNC("parser"); | |||||
char *name = "mwparserfromhell"; | |||||
if (!fromlist || !modname) | |||||
return -1; | |||||
PyList_SET_ITEM(fromlist, 0, modname); | |||||
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0); | |||||
Py_DECREF(fromlist); | |||||
if (!tempmod) | |||||
return -1; | |||||
    parsermod = PyObject_GetAttrString(tempmod, "parser");
    Py_DECREF(tempmod);
    if (!parsermod)
        return -1;
    ParserError = PyObject_GetAttrString(parsermod, "ParserError");
    Py_DECREF(parsermod);
    if (!ParserError)
        return -1;
return 0; | |||||
} | |||||
/*
    Initialize the _tokenizer extension module and register the CTokenizer
    type.
*/
PyMODINIT_FUNC INIT_FUNC_NAME(void)
{ | |||||
PyObject *module; | |||||
TokenizerType.tp_new = PyType_GenericNew; | |||||
if (PyType_Ready(&TokenizerType) < 0) | |||||
INIT_ERROR; | |||||
module = CREATE_MODULE; | |||||
if (!module) | |||||
INIT_ERROR; | |||||
Py_INCREF(&TokenizerType); | |||||
PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType); | |||||
Py_INCREF(Py_True); | |||||
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True); | |||||
NOARGS = PyTuple_New(0); | |||||
if (!NOARGS || load_entities() || load_tokens() || load_defs()) | |||||
INIT_ERROR; | |||||
#ifdef IS_PY3K | |||||
return module; | |||||
#endif | |||||
} |
@@ -0,0 +1,111 @@ | |||||
/* | |||||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||||
this software and associated documentation files (the "Software"), to deal in | |||||
the Software without restriction, including without limitation the rights to | |||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||||
of the Software, and to permit persons to whom the Software is furnished to do | |||||
so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. | |||||
*/ | |||||
#pragma once | |||||
#include "common.h" | |||||
#include "textbuffer.h" | |||||
/* Functions */ | |||||
static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); | |||||
static void Tokenizer_dealloc(Tokenizer*); | |||||
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); | |||||
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | |||||
/* Compatibility macros */ | |||||
#ifdef IS_PY3K | |||||
#define IMPORT_NAME_FUNC PyUnicode_FromString | |||||
#define CREATE_MODULE PyModule_Create(&module_def); | |||||
#define ENTITYDEFS_MODULE "html.entities" | |||||
#define INIT_FUNC_NAME PyInit__tokenizer | |||||
#define INIT_ERROR return NULL | |||||
#else | |||||
#define IMPORT_NAME_FUNC PyBytes_FromString | |||||
#define CREATE_MODULE Py_InitModule("_tokenizer", NULL); | |||||
#define ENTITYDEFS_MODULE "htmlentitydefs" | |||||
#define INIT_FUNC_NAME init_tokenizer | |||||
#define INIT_ERROR return | |||||
#endif | |||||
/* Structs */ | |||||
static PyMethodDef Tokenizer_methods[] = { | |||||
{"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS, | |||||
"Build a list of tokens from a string of wikicode and return it."}, | |||||
{NULL} | |||||
}; | |||||
static PyMemberDef Tokenizer_members[] = { | |||||
{NULL} | |||||
}; | |||||
static PyTypeObject TokenizerType = { | |||||
PyVarObject_HEAD_INIT(NULL, 0) | |||||
"_tokenizer.CTokenizer", /* tp_name */ | |||||
sizeof(Tokenizer), /* tp_basicsize */ | |||||
0, /* tp_itemsize */ | |||||
(destructor) Tokenizer_dealloc, /* tp_dealloc */ | |||||
0, /* tp_print */ | |||||
0, /* tp_getattr */ | |||||
0, /* tp_setattr */ | |||||
0, /* tp_compare */ | |||||
0, /* tp_repr */ | |||||
0, /* tp_as_number */ | |||||
0, /* tp_as_sequence */ | |||||
0, /* tp_as_mapping */ | |||||
0, /* tp_hash */ | |||||
0, /* tp_call */ | |||||
0, /* tp_str */ | |||||
0, /* tp_getattro */ | |||||
0, /* tp_setattro */ | |||||
0, /* tp_as_buffer */ | |||||
Py_TPFLAGS_DEFAULT, /* tp_flags */ | |||||
"Creates a list of tokens from a string of wikicode.", /* tp_doc */ | |||||
0, /* tp_traverse */ | |||||
0, /* tp_clear */ | |||||
0, /* tp_richcompare */ | |||||
0, /* tp_weaklistoffset */ | |||||
0, /* tp_iter */ | |||||
0, /* tp_iternext */ | |||||
Tokenizer_methods, /* tp_methods */ | |||||
Tokenizer_members, /* tp_members */ | |||||
0, /* tp_getset */ | |||||
0, /* tp_base */ | |||||
0, /* tp_dict */ | |||||
0, /* tp_descr_get */ | |||||
0, /* tp_descr_set */ | |||||
0, /* tp_dictoffset */ | |||||
(initproc) Tokenizer_init, /* tp_init */ | |||||
0, /* tp_alloc */ | |||||
Tokenizer_new, /* tp_new */ | |||||
}; | |||||
#ifdef IS_PY3K | |||||
static PyModuleDef module_def = { | |||||
PyModuleDef_HEAD_INIT, | |||||
"_tokenizer", | |||||
"Creates a list of tokens from a string of wikicode.", | |||||
-1, NULL, NULL, NULL, NULL, NULL | |||||
}; | |||||
#endif |
@@ -0,0 +1,111 @@ | |||||
/* | |||||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||||
this software and associated documentation files (the "Software"), to deal in | |||||
the Software without restriction, including without limitation the rights to | |||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||||
of the Software, and to permit persons to whom the Software is furnished to do | |||||
so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. | |||||
*/ | |||||
#include "tokens.h" | |||||
/* Globals */ | |||||
PyObject* Text; | |||||
PyObject* TemplateOpen; | |||||
PyObject* TemplateParamSeparator; | |||||
PyObject* TemplateParamEquals; | |||||
PyObject* TemplateClose; | |||||
PyObject* ArgumentOpen; | |||||
PyObject* ArgumentSeparator; | |||||
PyObject* ArgumentClose; | |||||
PyObject* WikilinkOpen; | |||||
PyObject* WikilinkSeparator; | |||||
PyObject* WikilinkClose; | |||||
PyObject* ExternalLinkOpen; | |||||
PyObject* ExternalLinkSeparator; | |||||
PyObject* ExternalLinkClose; | |||||
PyObject* HTMLEntityStart; | |||||
PyObject* HTMLEntityNumeric; | |||||
PyObject* HTMLEntityHex; | |||||
PyObject* HTMLEntityEnd; | |||||
PyObject* HeadingStart; | |||||
PyObject* HeadingEnd; | |||||
PyObject* CommentStart; | |||||
PyObject* CommentEnd; | |||||
PyObject* TagOpenOpen; | |||||
PyObject* TagAttrStart; | |||||
PyObject* TagAttrEquals; | |||||
PyObject* TagAttrQuote; | |||||
PyObject* TagCloseOpen; | |||||
PyObject* TagCloseSelfclose; | |||||
PyObject* TagOpenClose; | |||||
PyObject* TagCloseClose; | |||||
/* | |||||
Load individual tokens into globals from the given Python module object. | |||||
*/ | |||||
void load_tokens_from_module(PyObject* module) | |||||
{ | |||||
Text = PyObject_GetAttrString(module, "Text"); | |||||
TemplateOpen = PyObject_GetAttrString(module, "TemplateOpen"); | |||||
TemplateParamSeparator = PyObject_GetAttrString(module, | |||||
"TemplateParamSeparator"); | |||||
TemplateParamEquals = PyObject_GetAttrString(module, | |||||
"TemplateParamEquals"); | |||||
TemplateClose = PyObject_GetAttrString(module, "TemplateClose"); | |||||
ArgumentOpen = PyObject_GetAttrString(module, "ArgumentOpen"); | |||||
ArgumentSeparator = PyObject_GetAttrString(module, "ArgumentSeparator"); | |||||
ArgumentClose = PyObject_GetAttrString(module, "ArgumentClose"); | |||||
WikilinkOpen = PyObject_GetAttrString(module, "WikilinkOpen"); | |||||
WikilinkSeparator = PyObject_GetAttrString(module, "WikilinkSeparator"); | |||||
WikilinkClose = PyObject_GetAttrString(module, "WikilinkClose"); | |||||
ExternalLinkOpen = PyObject_GetAttrString(module, "ExternalLinkOpen"); | |||||
ExternalLinkSeparator = PyObject_GetAttrString(module, | |||||
"ExternalLinkSeparator"); | |||||
ExternalLinkClose = PyObject_GetAttrString(module, "ExternalLinkClose"); | |||||
HTMLEntityStart = PyObject_GetAttrString(module, "HTMLEntityStart"); | |||||
HTMLEntityNumeric = PyObject_GetAttrString(module, "HTMLEntityNumeric"); | |||||
HTMLEntityHex = PyObject_GetAttrString(module, "HTMLEntityHex"); | |||||
HTMLEntityEnd = PyObject_GetAttrString(module, "HTMLEntityEnd"); | |||||
HeadingStart = PyObject_GetAttrString(module, "HeadingStart"); | |||||
HeadingEnd = PyObject_GetAttrString(module, "HeadingEnd"); | |||||
CommentStart = PyObject_GetAttrString(module, "CommentStart"); | |||||
CommentEnd = PyObject_GetAttrString(module, "CommentEnd"); | |||||
TagOpenOpen = PyObject_GetAttrString(module, "TagOpenOpen"); | |||||
TagAttrStart = PyObject_GetAttrString(module, "TagAttrStart"); | |||||
TagAttrEquals = PyObject_GetAttrString(module, "TagAttrEquals"); | |||||
TagAttrQuote = PyObject_GetAttrString(module, "TagAttrQuote"); | |||||
TagCloseOpen = PyObject_GetAttrString(module, "TagCloseOpen"); | |||||
TagCloseSelfclose = PyObject_GetAttrString(module, "TagCloseSelfclose"); | |||||
TagOpenClose = PyObject_GetAttrString(module, "TagOpenClose"); | |||||
TagCloseClose = PyObject_GetAttrString(module, "TagCloseClose"); | |||||
} |
@@ -0,0 +1,69 @@ | |||||
/* | |||||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||||
this software and associated documentation files (the "Software"), to deal in | |||||
the Software without restriction, including without limitation the rights to | |||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||||
of the Software, and to permit persons to whom the Software is furnished to do | |||||
so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. | |||||
*/ | |||||
#pragma once | |||||
#include "common.h" | |||||
/* Token globals */ | |||||
extern PyObject* Text; | |||||
extern PyObject* TemplateOpen; | |||||
extern PyObject* TemplateParamSeparator; | |||||
extern PyObject* TemplateParamEquals; | |||||
extern PyObject* TemplateClose; | |||||
extern PyObject* ArgumentOpen; | |||||
extern PyObject* ArgumentSeparator; | |||||
extern PyObject* ArgumentClose; | |||||
extern PyObject* WikilinkOpen; | |||||
extern PyObject* WikilinkSeparator; | |||||
extern PyObject* WikilinkClose; | |||||
extern PyObject* ExternalLinkOpen; | |||||
extern PyObject* ExternalLinkSeparator; | |||||
extern PyObject* ExternalLinkClose; | |||||
extern PyObject* HTMLEntityStart; | |||||
extern PyObject* HTMLEntityNumeric; | |||||
extern PyObject* HTMLEntityHex; | |||||
extern PyObject* HTMLEntityEnd; | |||||
extern PyObject* HeadingStart; | |||||
extern PyObject* HeadingEnd; | |||||
extern PyObject* CommentStart; | |||||
extern PyObject* CommentEnd; | |||||
extern PyObject* TagOpenOpen; | |||||
extern PyObject* TagAttrStart; | |||||
extern PyObject* TagAttrEquals; | |||||
extern PyObject* TagAttrQuote; | |||||
extern PyObject* TagCloseOpen; | |||||
extern PyObject* TagCloseSelfclose; | |||||
extern PyObject* TagOpenClose; | |||||
extern PyObject* TagCloseClose; | |||||
/* Functions */ | |||||
void load_tokens_from_module(PyObject*); |
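
Each global declared here mirrors a class in the pure-Python token module; load_tokens_from_module() simply caches those attribute lookups. For orientation, the same objects are reachable from Python (a sketch; module path as in 0.4.x):

    from mwparserfromhell.parser import tokens

    # The classes the C globals above point to:
    print(tokens.Text, tokens.TemplateOpen, tokens.TemplateClose)
    # Token attributes are passed as keyword arguments:
    print(tokens.Text(text="hello"))
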
@@ -1,366 +0,0 @@ | |||||
/* | |||||
Tokenizer Header File for MWParserFromHell | |||||
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||||
this software and associated documentation files (the "Software"), to deal in | |||||
the Software without restriction, including without limitation the rights to | |||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |||||
of the Software, and to permit persons to whom the Software is furnished to do | |||||
so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. | |||||
*/ | |||||
#ifndef PY_SSIZE_T_CLEAN | |||||
#define PY_SSIZE_T_CLEAN | |||||
#endif | |||||
#include <Python.h> | |||||
#include <math.h> | |||||
#include <structmember.h> | |||||
#include <bytesobject.h> | |||||
#include <stdint.h> | |||||
#if PY_MAJOR_VERSION >= 3 | |||||
#define IS_PY3K | |||||
#endif | |||||
#define malloc PyObject_Malloc | |||||
#define free PyObject_Free | |||||
#define DIGITS "0123456789" | |||||
#define HEXDIGITS "0123456789abcdefABCDEF" | |||||
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" | |||||
static const char MARKERS[] = { | |||||
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', | |||||
'-', '!', '\n', '\0'}; | |||||
#define NUM_MARKERS 19 | |||||
#define TEXTBUFFER_BLOCKSIZE 1024 | |||||
#define MAX_DEPTH 40 | |||||
#define MAX_CYCLES 100000 | |||||
#define MAX_BRACES 255 | |||||
#define MAX_ENTITY_SIZE 8 | |||||
static int route_state = 0; | |||||
static uint64_t route_context = 0; | |||||
#define BAD_ROUTE route_state | |||||
#define BAD_ROUTE_CONTEXT route_context | |||||
#define FAIL_ROUTE(context) route_state = 1; route_context = context | |||||
#define RESET_ROUTE() route_state = 0 | |||||
static char** entitydefs; | |||||
static PyObject* EMPTY; | |||||
static PyObject* NOARGS; | |||||
static PyObject* ParserError; | |||||
static PyObject* definitions; | |||||
/* Tokens: */ | |||||
static PyObject* Text; | |||||
static PyObject* TemplateOpen; | |||||
static PyObject* TemplateParamSeparator; | |||||
static PyObject* TemplateParamEquals; | |||||
static PyObject* TemplateClose; | |||||
static PyObject* ArgumentOpen; | |||||
static PyObject* ArgumentSeparator; | |||||
static PyObject* ArgumentClose; | |||||
static PyObject* WikilinkOpen; | |||||
static PyObject* WikilinkSeparator; | |||||
static PyObject* WikilinkClose; | |||||
static PyObject* ExternalLinkOpen; | |||||
static PyObject* ExternalLinkSeparator; | |||||
static PyObject* ExternalLinkClose; | |||||
static PyObject* HTMLEntityStart; | |||||
static PyObject* HTMLEntityNumeric; | |||||
static PyObject* HTMLEntityHex; | |||||
static PyObject* HTMLEntityEnd; | |||||
static PyObject* HeadingStart; | |||||
static PyObject* HeadingEnd; | |||||
static PyObject* CommentStart; | |||||
static PyObject* CommentEnd; | |||||
static PyObject* TagOpenOpen; | |||||
static PyObject* TagAttrStart; | |||||
static PyObject* TagAttrEquals; | |||||
static PyObject* TagAttrQuote; | |||||
static PyObject* TagCloseOpen; | |||||
static PyObject* TagCloseSelfclose; | |||||
static PyObject* TagOpenClose; | |||||
static PyObject* TagCloseClose; | |||||
/* Local contexts: */ | |||||
#define LC_TEMPLATE 0x0000000000000007 | |||||
#define LC_TEMPLATE_NAME 0x0000000000000001 | |||||
#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002 | |||||
#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004 | |||||
#define LC_ARGUMENT 0x0000000000000018 | |||||
#define LC_ARGUMENT_NAME 0x0000000000000008 | |||||
#define LC_ARGUMENT_DEFAULT 0x0000000000000010 | |||||
#define LC_WIKILINK 0x0000000000000060 | |||||
#define LC_WIKILINK_TITLE 0x0000000000000020 | |||||
#define LC_WIKILINK_TEXT 0x0000000000000040 | |||||
#define LC_EXT_LINK 0x0000000000000180 | |||||
#define LC_EXT_LINK_URI 0x0000000000000080 | |||||
#define LC_EXT_LINK_TITLE 0x0000000000000100 | |||||
#define LC_HEADING 0x0000000000007E00 | |||||
#define LC_HEADING_LEVEL_1 0x0000000000000200 | |||||
#define LC_HEADING_LEVEL_2 0x0000000000000400 | |||||
#define LC_HEADING_LEVEL_3 0x0000000000000800 | |||||
#define LC_HEADING_LEVEL_4 0x0000000000001000 | |||||
#define LC_HEADING_LEVEL_5 0x0000000000002000 | |||||
#define LC_HEADING_LEVEL_6 0x0000000000004000 | |||||
#define LC_TAG 0x0000000000078000 | |||||
#define LC_TAG_OPEN 0x0000000000008000 | |||||
#define LC_TAG_ATTR 0x0000000000010000 | |||||
#define LC_TAG_BODY 0x0000000000020000 | |||||
#define LC_TAG_CLOSE 0x0000000000040000 | |||||
#define LC_STYLE 0x0000000000780000 | |||||
#define LC_STYLE_ITALICS 0x0000000000080000 | |||||
#define LC_STYLE_BOLD 0x0000000000100000 | |||||
#define LC_STYLE_PASS_AGAIN 0x0000000000200000 | |||||
#define LC_STYLE_SECOND_PASS 0x0000000000400000 | |||||
#define LC_DLTERM 0x0000000000800000 | |||||
#define LC_SAFETY_CHECK 0x000000003F000000 | |||||
#define LC_HAS_TEXT 0x0000000001000000 | |||||
#define LC_FAIL_ON_TEXT 0x0000000002000000 | |||||
#define LC_FAIL_NEXT 0x0000000004000000 | |||||
#define LC_FAIL_ON_LBRACE 0x0000000008000000 | |||||
#define LC_FAIL_ON_RBRACE 0x0000000010000000 | |||||
#define LC_FAIL_ON_EQUALS 0x0000000020000000 | |||||
#define LC_TABLE 0x0000000FC0000000 | |||||
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 | |||||
#define LC_TABLE_OPEN 0x0000000040000000 | |||||
#define LC_TABLE_CELL_OPEN 0x0000000080000000 | |||||
#define LC_TABLE_CELL_STYLE 0x0000000100000000 | |||||
#define LC_TABLE_ROW_OPEN 0x0000000200000000 | |||||
#define LC_TABLE_TD_LINE 0x0000000400000000 | |||||
#define LC_TABLE_TH_LINE 0x0000000800000000 | |||||
/* Global contexts: */ | |||||
#define GL_HEADING 0x1 | |||||
/* Aggregate contexts: */ | |||||
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) | |||||
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | |||||
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN) | |||||
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) | |||||
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) | |||||
/* Tag contexts: */ | |||||
#define TAG_NAME 0x01 | |||||
#define TAG_ATTR_READY 0x02 | |||||
#define TAG_ATTR_NAME 0x04 | |||||
#define TAG_ATTR_VALUE 0x08 | |||||
#define TAG_QUOTED 0x10 | |||||
#define TAG_NOTE_SPACE 0x20 | |||||
#define TAG_NOTE_EQUALS 0x40 | |||||
#define TAG_NOTE_QUOTE 0x80 | |||||
/* Miscellaneous structs: */ | |||||
struct Textbuffer { | |||||
Py_ssize_t size; | |||||
Py_UNICODE* data; | |||||
struct Textbuffer* prev; | |||||
struct Textbuffer* next; | |||||
}; | |||||
struct Stack { | |||||
PyObject* stack; | |||||
uint64_t context; | |||||
struct Textbuffer* textbuffer; | |||||
struct Stack* next; | |||||
}; | |||||
typedef struct { | |||||
PyObject* title; | |||||
int level; | |||||
} HeadingData; | |||||
typedef struct { | |||||
uint64_t context; | |||||
struct Textbuffer* pad_first; | |||||
struct Textbuffer* pad_before_eq; | |||||
struct Textbuffer* pad_after_eq; | |||||
Py_UNICODE quoter; | |||||
Py_ssize_t reset; | |||||
} TagData; | |||||
typedef struct Textbuffer Textbuffer; | |||||
typedef struct Stack Stack; | |||||
/* Tokenizer object definition: */ | |||||
typedef struct { | |||||
PyObject_HEAD | |||||
PyObject* text; /* text to tokenize */ | |||||
Stack* topstack; /* topmost stack */ | |||||
Py_ssize_t head; /* current position in text */ | |||||
Py_ssize_t length; /* length of text */ | |||||
int global; /* global context */ | |||||
int depth; /* stack recursion depth */ | |||||
int cycles; /* total number of stack recursions */ | |||||
int skip_style_tags; /* temporary fix for the sometimes broken tag parser */ | |||||
} Tokenizer; | |||||
/* Macros related to Tokenizer functions: */ | |||||
#define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta))) | |||||
#define Tokenizer_READ_BACKWARDS(self, delta) \ | |||||
(*PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, delta))) | |||||
#define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES) | |||||
#define Tokenizer_emit(self, token) Tokenizer_emit_token(self, token, 0) | |||||
#define Tokenizer_emit_first(self, token) Tokenizer_emit_token(self, token, 1) | |||||
#define Tokenizer_emit_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 0) | |||||
#define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1) | |||||
/* Macros for accessing definitions: */ | |||||
#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li") | |||||
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL)) | |||||
#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL)) | |||||
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL)) | |||||
#define IS_SCHEME(scheme, slashes, reverse) \ | |||||
(call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False)) | |||||
/* Function prototypes: */ | |||||
static Textbuffer* Textbuffer_new(void); | |||||
static void Textbuffer_dealloc(Textbuffer*); | |||||
static TagData* TagData_new(void); | |||||
static void TagData_dealloc(TagData*); | |||||
static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); | |||||
static void Tokenizer_dealloc(Tokenizer*); | |||||
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); | |||||
static int Tokenizer_parse_entity(Tokenizer*); | |||||
static int Tokenizer_parse_comment(Tokenizer*); | |||||
static int Tokenizer_handle_dl_term(Tokenizer*); | |||||
static int Tokenizer_parse_tag(Tokenizer*); | |||||
static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int); | |||||
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | |||||
static int load_exceptions(void); | |||||
/* Macros for Python 2/3 compatibility: */ | |||||
#ifdef IS_PY3K | |||||
#define NEW_INT_FUNC PyLong_FromSsize_t | |||||
#define IMPORT_NAME_FUNC PyUnicode_FromString | |||||
#define CREATE_MODULE PyModule_Create(&module_def); | |||||
#define ENTITYDEFS_MODULE "html.entities" | |||||
#define INIT_FUNC_NAME PyInit__tokenizer | |||||
#define INIT_ERROR return NULL | |||||
#else | |||||
#define NEW_INT_FUNC PyInt_FromSsize_t | |||||
#define IMPORT_NAME_FUNC PyBytes_FromString | |||||
#define CREATE_MODULE Py_InitModule("_tokenizer", NULL); | |||||
#define ENTITYDEFS_MODULE "htmlentitydefs" | |||||
#define INIT_FUNC_NAME init_tokenizer | |||||
#define INIT_ERROR return | |||||
#endif | |||||
/* More structs for creating the Tokenizer type: */ | |||||
static PyMethodDef Tokenizer_methods[] = { | |||||
{"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS, | |||||
"Build a list of tokens from a string of wikicode and return it."}, | |||||
{NULL} | |||||
}; | |||||
static PyMemberDef Tokenizer_members[] = { | |||||
{NULL} | |||||
}; | |||||
static PyTypeObject TokenizerType = { | |||||
PyVarObject_HEAD_INIT(NULL, 0) | |||||
"_tokenizer.CTokenizer", /* tp_name */ | |||||
sizeof(Tokenizer), /* tp_basicsize */ | |||||
0, /* tp_itemsize */ | |||||
(destructor) Tokenizer_dealloc, /* tp_dealloc */ | |||||
0, /* tp_print */ | |||||
0, /* tp_getattr */ | |||||
0, /* tp_setattr */ | |||||
0, /* tp_compare */ | |||||
0, /* tp_repr */ | |||||
0, /* tp_as_number */ | |||||
0, /* tp_as_sequence */ | |||||
0, /* tp_as_mapping */ | |||||
0, /* tp_hash */ | |||||
0, /* tp_call */ | |||||
0, /* tp_str */ | |||||
0, /* tp_getattro */ | |||||
0, /* tp_setattro */ | |||||
0, /* tp_as_buffer */ | |||||
Py_TPFLAGS_DEFAULT, /* tp_flags */ | |||||
"Creates a list of tokens from a string of wikicode.", /* tp_doc */ | |||||
0, /* tp_traverse */ | |||||
0, /* tp_clear */ | |||||
0, /* tp_richcompare */ | |||||
0, /* tp_weaklistoffset */ | |||||
0, /* tp_iter */ | |||||
0, /* tp_iternext */ | |||||
Tokenizer_methods, /* tp_methods */ | |||||
Tokenizer_members, /* tp_members */ | |||||
0, /* tp_getset */ | |||||
0, /* tp_base */ | |||||
0, /* tp_dict */ | |||||
0, /* tp_descr_get */ | |||||
0, /* tp_descr_set */ | |||||
0, /* tp_dictoffset */ | |||||
(initproc) Tokenizer_init, /* tp_init */ | |||||
0, /* tp_alloc */ | |||||
Tokenizer_new, /* tp_new */ | |||||
}; | |||||
#ifdef IS_PY3K | |||||
static PyModuleDef module_def = { | |||||
PyModuleDef_HEAD_INIT, | |||||
"_tokenizer", | |||||
"Creates a list of tokens from a string of wikicode.", | |||||
-1, NULL, NULL, NULL, NULL, NULL | |||||
}; | |||||
#endif |
@@ -192,11 +192,14 @@ class Tokenizer(object): | |||||
self._fail_route() | self._fail_route() | ||||
return self.END | return self.END | ||||
def _parse_template(self): | |||||
def _parse_template(self, has_content): | |||||
"""Parse a template at the head of the wikicode string.""" | """Parse a template at the head of the wikicode string.""" | ||||
reset = self._head | reset = self._head | ||||
context = contexts.TEMPLATE_NAME | |||||
if has_content: | |||||
context |= contexts.HAS_TEMPLATE | |||||
try: | try: | ||||
template = self._parse(contexts.TEMPLATE_NAME) | |||||
template = self._parse(context) | |||||
except BadRoute: | except BadRoute: | ||||
self._head = reset | self._head = reset | ||||
raise | raise | ||||
@@ -223,6 +226,7 @@ class Tokenizer(object): | |||||
while self._read() == "{": | while self._read() == "{": | ||||
self._head += 1 | self._head += 1 | ||||
braces += 1 | braces += 1 | ||||
has_content = False | |||||
self._push() | self._push() | ||||
while braces: | while braces: | ||||
@@ -230,7 +234,7 @@ class Tokenizer(object): | |||||
return self._emit_text_then_stack("{") | return self._emit_text_then_stack("{") | ||||
if braces == 2: | if braces == 2: | ||||
try: | try: | ||||
self._parse_template() | |||||
self._parse_template(has_content) | |||||
except BadRoute: | except BadRoute: | ||||
return self._emit_text_then_stack("{{") | return self._emit_text_then_stack("{{") | ||||
break | break | ||||
@@ -239,11 +243,12 @@ class Tokenizer(object): | |||||
braces -= 3 | braces -= 3 | ||||
except BadRoute: | except BadRoute: | ||||
try: | try: | ||||
self._parse_template() | |||||
self._parse_template(has_content) | |||||
braces -= 2 | braces -= 2 | ||||
except BadRoute: | except BadRoute: | ||||
return self._emit_text_then_stack("{" * braces) | return self._emit_text_then_stack("{" * braces) | ||||
if braces: | if braces: | ||||
has_content = True | |||||
self._head += 1 | self._head += 1 | ||||
self._emit_all(self._pop()) | self._emit_all(self._pop()) | ||||
@@ -253,6 +258,8 @@ class Tokenizer(object): | |||||
def _handle_template_param(self): | def _handle_template_param(self): | ||||
"""Handle a template parameter at the head of the string.""" | """Handle a template parameter at the head of the string.""" | ||||
if self._context & contexts.TEMPLATE_NAME: | if self._context & contexts.TEMPLATE_NAME: | ||||
if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE): | |||||
self._fail_route() | |||||
self._context ^= contexts.TEMPLATE_NAME | self._context ^= contexts.TEMPLATE_NAME | ||||
elif self._context & contexts.TEMPLATE_PARAM_VALUE: | elif self._context & contexts.TEMPLATE_PARAM_VALUE: | ||||
self._context ^= contexts.TEMPLATE_PARAM_VALUE | self._context ^= contexts.TEMPLATE_PARAM_VALUE | ||||
@@ -271,7 +278,10 @@ class Tokenizer(object): | |||||
def _handle_template_end(self): | def _handle_template_end(self): | ||||
"""Handle the end of a template at the head of the string.""" | """Handle the end of a template at the head of the string.""" | ||||
if self._context & contexts.TEMPLATE_PARAM_KEY: | |||||
if self._context & contexts.TEMPLATE_NAME: | |||||
if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE): | |||||
self._fail_route() | |||||
elif self._context & contexts.TEMPLATE_PARAM_KEY: | |||||
self._emit_all(self._pop(keep_context=True)) | self._emit_all(self._pop(keep_context=True)) | ||||
self._head += 1 | self._head += 1 | ||||
return self._pop() | return self._pop() | ||||
@@ -610,7 +620,7 @@ class Tokenizer(object): | |||||
self._head += 2 | self._head += 2 | ||||
if self._context & contexts.FAIL_NEXT: | if self._context & contexts.FAIL_NEXT: | ||||
# _verify_safe() sets this flag while parsing a template | # _verify_safe() sets this flag while parsing a template | ||||
# name when it encounters what might be a comment -- we | |||||
# or link when it encounters what might be a comment -- we | |||||
# must unset it to let _verify_safe() know it was correct: | # must unset it to let _verify_safe() know it was correct: | ||||
self._context ^= contexts.FAIL_NEXT | self._context ^= contexts.FAIL_NEXT | ||||
return | return | ||||
@@ -1172,29 +1182,33 @@ class Tokenizer(object): | |||||
if context & contexts.WIKILINK_TITLE: | if context & contexts.WIKILINK_TITLE: | ||||
if this == "]" or this == "{": | if this == "]" or this == "{": | ||||
self._context |= contexts.FAIL_NEXT | self._context |= contexts.FAIL_NEXT | ||||
elif this == "\n" or this == "[" or this == "}": | |||||
elif this == "\n" or this == "[" or this == "}" or this == ">": | |||||
return False | return False | ||||
elif this == "<": | |||||
if self._read(1) == "!": | |||||
self._context |= contexts.FAIL_NEXT | |||||
else: | |||||
return False | |||||
return True | return True | ||||
elif context & contexts.EXT_LINK_TITLE: | elif context & contexts.EXT_LINK_TITLE: | ||||
return this != "\n" | return this != "\n" | ||||
elif context & contexts.TEMPLATE_NAME: | elif context & contexts.TEMPLATE_NAME: | ||||
if this == "{" or this == "}" or this == "[": | |||||
if this == "{": | |||||
self._context |= contexts.HAS_TEMPLATE | contexts.FAIL_NEXT | |||||
return True | |||||
if this == "}" or (this == "<" and self._read(1) == "!"): | |||||
self._context |= contexts.FAIL_NEXT | self._context |= contexts.FAIL_NEXT | ||||
return True | return True | ||||
if this == "]": | |||||
if this == "[" or this == "]" or this == "<" or this == ">": | |||||
return False | return False | ||||
if this == "|": | if this == "|": | ||||
return True | return True | ||||
if context & contexts.HAS_TEXT: | if context & contexts.HAS_TEXT: | ||||
if context & contexts.FAIL_ON_TEXT: | if context & contexts.FAIL_ON_TEXT: | ||||
if this is self.END or not this.isspace(): | if this is self.END or not this.isspace(): | ||||
if this == "<" and self._read(1) == "!": | |||||
self._context |= contexts.FAIL_NEXT | |||||
return True | |||||
return False | return False | ||||
else: | |||||
if this == "\n": | |||||
self._context |= contexts.FAIL_ON_TEXT | |||||
elif this == "\n": | |||||
self._context |= contexts.FAIL_ON_TEXT | |||||
elif this is self.END or not this.isspace(): | elif this is self.END or not this.isspace(): | ||||
self._context |= contexts.HAS_TEXT | self._context |= contexts.HAS_TEXT | ||||
return True | return True | ||||
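
The net effect of the new HAS_TEMPLATE checks and the '<'/'>' rejection is visible through the public API: a template with a completely blank name, or with an angle bracket in its name, no longer parses as a template and falls back to plain text. A rough sketch of the expected behavior:

    import mwparserfromhell

    # Blank or angle-bracketed names fail the template route:
    print(mwparserfromhell.parse("{{}}").filter_templates())     # expected: []
    print(mwparserfromhell.parse("{{a<b}}").filter_templates())  # expected: []
    # Ordinary templates still parse:
    print(mwparserfromhell.parse("{{foo}}").filter_templates())  # ['{{foo}}']
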
@@ -27,8 +27,10 @@ reflect changes made to the main list, and vice-versa. | |||||
""" | """ | ||||
from __future__ import unicode_literals | from __future__ import unicode_literals | ||||
from sys import maxsize | |||||
from weakref import ref | |||||
from .compat import maxsize, py3k | |||||
from .compat import py3k | |||||
__all__ = ["SmartList"] | __all__ = ["SmartList"] | ||||
@@ -45,16 +47,16 @@ def inheritdoc(method): | |||||
class _SliceNormalizerMixIn(object): | class _SliceNormalizerMixIn(object): | ||||
"""MixIn that provides a private method to normalize slices.""" | """MixIn that provides a private method to normalize slices.""" | ||||
def _normalize_slice(self, key): | |||||
def _normalize_slice(self, key, clamp=False): | |||||
"""Return a slice equivalent to the input *key*, standardized.""" | """Return a slice equivalent to the input *key*, standardized.""" | ||||
if key.start is not None: | |||||
if key.start is None: | |||||
start = 0 | |||||
else: | |||||
start = (len(self) + key.start) if key.start < 0 else key.start | start = (len(self) + key.start) if key.start < 0 else key.start | ||||
if key.stop is None or key.stop == maxsize: | |||||
stop = len(self) if clamp else None | |||||
else: | else: | ||||
start = 0 | |||||
if key.stop is not None: | |||||
stop = (len(self) + key.stop) if key.stop < 0 else key.stop | stop = (len(self) + key.stop) if key.stop < 0 else key.stop | ||||
else: | |||||
stop = maxsize | |||||
return slice(start, stop, key.step or 1) | return slice(start, stop, key.step or 1) | ||||
@@ -80,13 +82,6 @@ class SmartList(_SliceNormalizerMixIn, list): | |||||
[2, 3, 4] | [2, 3, 4] | ||||
>>> parent | >>> parent | ||||
[0, 1, 2, 3, 4] | [0, 1, 2, 3, 4] | ||||
The parent needs to keep a list of its children in order to update them, | |||||
which prevents them from being garbage-collected. If you are keeping the | |||||
parent around for a while but creating many children, it is advisable to | |||||
call :meth:`._ListProxy.detach` when you're finished with them. Certain | |||||
parent methods, like :meth:`reverse` and :meth:`sort`, will do this | |||||
automatically. | |||||
""" | """ | ||||
def __init__(self, iterable=None): | def __init__(self, iterable=None): | ||||
@@ -99,10 +94,11 @@ class SmartList(_SliceNormalizerMixIn, list): | |||||
def __getitem__(self, key): | def __getitem__(self, key): | ||||
if not isinstance(key, slice): | if not isinstance(key, slice): | ||||
return super(SmartList, self).__getitem__(key) | return super(SmartList, self).__getitem__(key) | ||||
key = self._normalize_slice(key) | |||||
key = self._normalize_slice(key, clamp=False) | |||||
sliceinfo = [key.start, key.stop, key.step] | sliceinfo = [key.start, key.stop, key.step] | ||||
child = _ListProxy(self, sliceinfo) | child = _ListProxy(self, sliceinfo) | ||||
self._children[id(child)] = (child, sliceinfo) | |||||
child_ref = ref(child, self._delete_child) | |||||
self._children[id(child_ref)] = (child_ref, sliceinfo) | |||||
return child | return child | ||||
def __setitem__(self, key, item): | def __setitem__(self, key, item): | ||||
@@ -110,20 +106,21 @@ class SmartList(_SliceNormalizerMixIn, list): | |||||
return super(SmartList, self).__setitem__(key, item) | return super(SmartList, self).__setitem__(key, item) | ||||
item = list(item) | item = list(item) | ||||
super(SmartList, self).__setitem__(key, item) | super(SmartList, self).__setitem__(key, item) | ||||
key = self._normalize_slice(key) | |||||
key = self._normalize_slice(key, clamp=True) | |||||
diff = len(item) + (key.start - key.stop) // key.step | diff = len(item) + (key.start - key.stop) // key.step | ||||
if not diff: | |||||
return | |||||
values = self._children.values if py3k else self._children.itervalues | values = self._children.values if py3k else self._children.itervalues | ||||
if diff: | |||||
for child, (start, stop, step) in values(): | |||||
if start > key.stop: | |||||
self._children[id(child)][1][0] += diff | |||||
if stop >= key.stop and stop != maxsize: | |||||
self._children[id(child)][1][1] += diff | |||||
for child, (start, stop, step) in values(): | |||||
if start > key.stop: | |||||
self._children[id(child)][1][0] += diff | |||||
if stop is not None and stop >= key.stop: | |||||
self._children[id(child)][1][1] += diff | |||||
def __delitem__(self, key): | def __delitem__(self, key): | ||||
super(SmartList, self).__delitem__(key) | super(SmartList, self).__delitem__(key) | ||||
if isinstance(key, slice): | if isinstance(key, slice): | ||||
key = self._normalize_slice(key) | |||||
key = self._normalize_slice(key, clamp=True) | |||||
else: | else: | ||||
key = slice(key, key + 1, 1) | key = slice(key, key + 1, 1) | ||||
diff = (key.stop - key.start) // key.step | diff = (key.stop - key.start) // key.step | ||||
@@ -131,7 +128,7 @@ class SmartList(_SliceNormalizerMixIn, list): | |||||
for child, (start, stop, step) in values(): | for child, (start, stop, step) in values(): | ||||
if start > key.start: | if start > key.start: | ||||
self._children[id(child)][1][0] -= diff | self._children[id(child)][1][0] -= diff | ||||
if stop >= key.stop and stop != maxsize: | |||||
if stop is not None and stop >= key.stop: | |||||
self._children[id(child)][1][1] -= diff | self._children[id(child)][1][1] -= diff | ||||
if not py3k: | if not py3k: | ||||
@@ -154,10 +151,16 @@ class SmartList(_SliceNormalizerMixIn, list): | |||||
self.extend(other) | self.extend(other) | ||||
return self | return self | ||||
def _delete_child(self, child_ref): | |||||
"""Remove a child reference that is about to be garbage-collected.""" | |||||
del self._children[id(child_ref)] | |||||
def _detach_children(self): | def _detach_children(self): | ||||
"""Remove all children and give them independent parent copies.""" | |||||
children = [val[0] for val in self._children.values()] | children = [val[0] for val in self._children.values()] | ||||
for child in children: | for child in children: | ||||
child.detach() | |||||
child()._parent = list(self) | |||||
self._children.clear() | |||||
@inheritdoc | @inheritdoc | ||||
def append(self, item): | def append(self, item): | ||||
@@ -226,7 +229,6 @@ class _ListProxy(_SliceNormalizerMixIn, list): | |||||
super(_ListProxy, self).__init__() | super(_ListProxy, self).__init__() | ||||
self._parent = parent | self._parent = parent | ||||
self._sliceinfo = sliceinfo | self._sliceinfo = sliceinfo | ||||
self._detached = False | |||||
def __repr__(self): | def __repr__(self): | ||||
return repr(self._render()) | return repr(self._render()) | ||||
@@ -273,24 +275,20 @@ class _ListProxy(_SliceNormalizerMixIn, list): | |||||
def __getitem__(self, key): | def __getitem__(self, key): | ||||
if isinstance(key, slice): | if isinstance(key, slice): | ||||
key = self._normalize_slice(key) | |||||
if key.stop == maxsize: | |||||
keystop = self._stop | |||||
else: | |||||
keystop = key.stop + self._start | |||||
adjusted = slice(key.start + self._start, keystop, key.step) | |||||
key = self._normalize_slice(key, clamp=True) | |||||
keystart = min(self._start + key.start, self._stop) | |||||
keystop = min(self._start + key.stop, self._stop) | |||||
adjusted = slice(keystart, keystop, key.step) | |||||
return self._parent[adjusted] | return self._parent[adjusted] | ||||
else: | else: | ||||
return self._render()[key] | return self._render()[key] | ||||
def __setitem__(self, key, item): | def __setitem__(self, key, item): | ||||
if isinstance(key, slice): | if isinstance(key, slice): | ||||
key = self._normalize_slice(key) | |||||
if key.stop == maxsize: | |||||
keystop = self._stop | |||||
else: | |||||
keystop = key.stop + self._start | |||||
adjusted = slice(key.start + self._start, keystop, key.step) | |||||
key = self._normalize_slice(key, clamp=True) | |||||
keystart = min(self._start + key.start, self._stop) | |||||
keystop = min(self._start + key.stop, self._stop) | |||||
adjusted = slice(keystart, keystop, key.step) | |||||
self._parent[adjusted] = item | self._parent[adjusted] = item | ||||
else: | else: | ||||
length = len(self) | length = len(self) | ||||
@@ -302,12 +300,10 @@ class _ListProxy(_SliceNormalizerMixIn, list): | |||||
def __delitem__(self, key): | def __delitem__(self, key): | ||||
if isinstance(key, slice): | if isinstance(key, slice): | ||||
key = self._normalize_slice(key) | |||||
if key.stop == maxsize: | |||||
keystop = self._stop | |||||
else: | |||||
keystop = key.stop + self._start | |||||
adjusted = slice(key.start + self._start, keystop, key.step) | |||||
key = self._normalize_slice(key, clamp=True) | |||||
keystart = min(self._start + key.start, self._stop) | |||||
keystop = min(self._start + key.stop, self._stop) | |||||
adjusted = slice(keystart, keystop, key.step) | |||||
del self._parent[adjusted] | del self._parent[adjusted] | ||||
else: | else: | ||||
length = len(self) | length = len(self) | ||||
@@ -370,7 +366,7 @@ class _ListProxy(_SliceNormalizerMixIn, list): | |||||
@property | @property | ||||
def _stop(self): | def _stop(self): | ||||
"""The ending index of this list, exclusive.""" | """The ending index of this list, exclusive.""" | ||||
if self._sliceinfo[1] == maxsize: | |||||
if self._sliceinfo[1] is None: | |||||
return len(self._parent) | return len(self._parent) | ||||
return self._sliceinfo[1] | return self._sliceinfo[1] | ||||
@@ -456,17 +452,5 @@ class _ListProxy(_SliceNormalizerMixIn, list): | |||||
item.sort(**kwargs) | item.sort(**kwargs) | ||||
self._parent[self._start:self._stop:self._step] = item | self._parent[self._start:self._stop:self._step] = item | ||||
def detach(self): | |||||
"""Detach the child so it operates like a normal list. | |||||
This allows children to be properly garbage-collected if their parent | |||||
is being kept around for a long time. This method has no effect if the | |||||
child is already detached. | |||||
""" | |||||
if not self._detached: | |||||
self._parent._children.pop(id(self)) | |||||
self._parent = list(self._parent) | |||||
self._detached = True | |||||
del inheritdoc | del inheritdoc |
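
With children now held through weak references, a child slice drops out of its parent's bookkeeping as soon as the last real reference to it goes away, so the old detach() call is unnecessary. A rough sketch on CPython, where reference counting collects immediately (_children is a private attribute, inspected here purely for illustration; module path as in 0.4.x):

    from mwparserfromhell.smart_list import SmartList

    parent = SmartList([0, 1, 2, 3, 4])
    child = parent[2:]            # parent stores only a weak reference
    parent.append(5)
    print(child)                  # [2, 3, 4, 5] -- still tracks the parent
    del child                     # weakref callback prunes parent._children
    print(len(parent._children))  # expected: 0
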
@@ -0,0 +1,3 @@ | |||||
This directory contains support files used for *developing* mwparserfromhell, | |||||
not running it. If you are looking for code examples, read the documentation | |||||
or explore the source code. |
@@ -31,10 +31,17 @@ update_version() { | |||||
echo " done." | echo " done." | ||||
} | } | ||||
update_appveyor() { | |||||
filename="appveyor.yml" | |||||
echo -n "Updating $filename..." | |||||
sed -e "s/version: .*/version: $VERSION-b{build}/" -i "" $filename | |||||
echo " done." | |||||
} | |||||
update_changelog() { | update_changelog() { | ||||
filename="CHANGELOG" | filename="CHANGELOG" | ||||
echo -n "Updating $filename..." | echo -n "Updating $filename..." | ||||
sed -e '1s/.*/v'$VERSION' (released '$RELEASE_DATE'):/' -i "" $filename | |||||
sed -e "1s/.*/v$VERSION (released $RELEASE_DATE):/" -i "" $filename | |||||
echo " done." | echo " done." | ||||
} | } | ||||
@@ -45,10 +52,10 @@ update_docs_changelog() { | |||||
previous_lineno=$(expr $(grep -n -e "^---" $filename | sed '2q;d' | cut -d ':' -f 1) - 1) | previous_lineno=$(expr $(grep -n -e "^---" $filename | sed '2q;d' | cut -d ':' -f 1) - 1) | ||||
previous_version=$(sed $previous_lineno'q;d' $filename) | previous_version=$(sed $previous_lineno'q;d' $filename) | ||||
sed \ | sed \ | ||||
-e '4s/.*/v'$VERSION \ | |||||
-e '5s/.*/'$dashes \ | |||||
-e '7s/.*/`Released '$RELEASE_DATE' <https:\/\/github.com\/earwig\/mwparserfromhell\/tree\/v'$VERSION'>`_/' \ | |||||
-e '8s/.*/(`changes <https:\/\/github.com\/earwig\/mwparserfromhell\/compare\/v'$previous_version'...v'$VERSION'>`__):/' \ | |||||
-e "4s/.*/v$VERSION/" \ | |||||
-e "5s/.*/$dashes/" \ | |||||
-e "7s/.*/\`Released $RELEASE_DATE <https:\/\/github.com\/earwig\/mwparserfromhell\/tree\/v$VERSION>\`_/" \ | |||||
-e "8s/.*/(\`changes <https:\/\/github.com\/earwig\/mwparserfromhell\/compare\/$previous_version...v$VERSION>\`__):/" \ | |||||
-i "" $filename | -i "" $filename | ||||
echo " done." | echo " done." | ||||
} | } | ||||
@@ -67,25 +74,18 @@ do_git_stuff() { | |||||
} | } | ||||
upload_to_pypi() { | upload_to_pypi() { | ||||
# TODO: check whether these commands give output | |||||
echo -n "PyPI: uploading source tarball and docs..." | echo -n "PyPI: uploading source tarball and docs..." | ||||
python setup.py register sdist upload -s | |||||
python setup.py upload_docs | |||||
python setup.py -q register sdist upload -s | |||||
python setup.py -q upload_docs | |||||
echo " done." | echo " done." | ||||
} | } | ||||
windows_build() { | |||||
echo "PyPI: building/uploading Windows binaries..." | |||||
echo "*** Run in Windows: ./scripts/win_build.py" | |||||
echo "*** Press enter when done." | |||||
read | |||||
} | |||||
post_release() { | post_release() { | ||||
echo | echo | ||||
echo "*** Release completed." | echo "*** Release completed." | ||||
echo "*** Update: https://github.com/earwig/mwparserfromhell/releases/tag/v$VERSION" | echo "*** Update: https://github.com/earwig/mwparserfromhell/releases/tag/v$VERSION" | ||||
echo "*** Verify: https://pypi.python.org/pypi/mwparserfromhell" | echo "*** Verify: https://pypi.python.org/pypi/mwparserfromhell" | ||||
echo "*** Verify: https://ci.appveyor.com/project/earwig/mwparserfromhell" | |||||
echo "*** Verify: https://mwparserfromhell.readthedocs.org" | echo "*** Verify: https://mwparserfromhell.readthedocs.org" | ||||
echo "*** Press enter to sanity-check the release." | echo "*** Press enter to sanity-check the release." | ||||
read | read | ||||
@@ -153,11 +153,11 @@ cd "$SCRIPT_DIR/.." | |||||
check_git | check_git | ||||
update_version | update_version | ||||
update_appveyor | |||||
update_changelog | update_changelog | ||||
update_docs_changelog | update_docs_changelog | ||||
do_git_stuff | do_git_stuff | ||||
upload_to_pypi | upload_to_pypi | ||||
windows_build | |||||
post_release | post_release | ||||
test_release | test_release | ||||
@@ -1,58 +0,0 @@ | |||||
# Build requirements: | |||||
# | |||||
# Python 2.6-3.2: Visual C++ Express Edition 2008: | |||||
# http://go.microsoft.com/?linkid=7729279 | |||||
# | |||||
# Python 3.3+: Visual C++ Express Edition 2010: | |||||
# http://go.microsoft.com/?linkid=9709949 | |||||
# | |||||
# x64 builds: Microsoft Windows SDK for Windows 7 and .NET Framework 3.5 SP1: | |||||
# http://www.microsoft.com/en-us/download/details.aspx?id=3138 | |||||
# | |||||
# Python interpreter, 2.6, 2.7, 3.2-3.4: | |||||
# https://www.python.org/downloads/ | |||||
# | |||||
# Pip, setuptools, wheel: | |||||
# https://bootstrap.pypa.io/get-pip.py | |||||
# and run *for each* Python version: | |||||
# c:\pythonXX\python get-pip.py | |||||
# c:\pythonXX\scripts\pip install wheel | |||||
# | |||||
# Afterwards, run this script with any of the python interpreters (2.7 suggested) | |||||
from __future__ import print_function | |||||
import os | |||||
from subprocess import call, STDOUT | |||||
ENVIRONMENTS = ["26", "27", "32", "33", "34"] | |||||
def run(pyver, cmds): | |||||
cmd = [r"C:\Python%s\Python.exe" % pyver, "setup.py"] + cmds | |||||
print(" ".join(cmd), end=" ") | |||||
with open("%s%s.log" % (cmds[0], pyver), "w") as logfile: | |||||
retval = call(cmd, stdout=logfile, stderr=STDOUT, cwd="..") | |||||
if not retval: | |||||
print("[OK]") | |||||
else: | |||||
print("[FAILED (%i)]" % retval) | |||||
return retval | |||||
def main(): | |||||
path = os.path.split(__file__)[0] | |||||
if path: | |||||
os.chdir(path) | |||||
print("Building Windows wheels for Python %s:" % ", ".join(ENVIRONMENTS)) | |||||
for pyver in ENVIRONMENTS: | |||||
print() | |||||
try: | |||||
os.unlink("mwparserfromhell/parser/_tokenizer.pyd") | |||||
except OSError: | |||||
pass | |||||
if run(pyver, ["test"]) == 0: | |||||
run(pyver, ["bdist_wheel", "upload"]) # TODO: add "-s" to GPG sign | |||||
if __name__ == "__main__": | |||||
main() |
@@ -0,0 +1,43 @@ | |||||
:: To build extensions for 64 bit Python 3, we need to configure environment | |||||
:: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: | |||||
:: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) | |||||
:: | |||||
:: To build extensions for 64 bit Python 2, we need to configure environment | |||||
:: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: | |||||
:: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) | |||||
:: | |||||
:: 32 bit builds do not require specific environment configurations. | |||||
:: | |||||
:: Note: this script needs to be run with the /E:ON and /V:ON flags for the | |||||
:: cmd interpreter, at least for (SDK v7.0) | |||||
:: | |||||
:: More details at: | |||||
:: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows | |||||
:: http://stackoverflow.com/a/13751649/163740 | |||||
:: | |||||
:: Author: Olivier Grisel | |||||
:: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ | |||||
@ECHO OFF | |||||
SET COMMAND_TO_RUN=%* | |||||
SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows | |||||
SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%" | |||||
IF %MAJOR_PYTHON_VERSION% == "2" ( | |||||
SET WINDOWS_SDK_VERSION="v7.0" | |||||
) ELSE IF %MAJOR_PYTHON_VERSION% == "3" ( | |||||
SET WINDOWS_SDK_VERSION="v7.1" | |||||
) ELSE ( | |||||
ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" | |||||
EXIT 1 | |||||
) | |||||
IF "%PYTHON_ARCH%"=="64" ( | |||||
SET DISTUTILS_USE_SDK=1 | |||||
SET MSSdk=1 | |||||
"%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% | |||||
"%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release | |||||
call %COMMAND_TO_RUN% || EXIT 1 | |||||
) ELSE ( | |||||
call %COMMAND_TO_RUN% || EXIT 1 | |||||
) |
@@ -21,88 +21,67 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
import os | |||||
from __future__ import print_function | |||||
from distutils.errors import DistutilsError, CCompilerError | |||||
from glob import glob | |||||
from os import environ | |||||
import sys | import sys | ||||
if (sys.version_info[0] == 2 and sys.version_info[1] < 6) or \ | |||||
(sys.version_info[1] == 3 and sys.version_info[1] < 2): | |||||
raise Exception("mwparserfromhell needs Python 2.6+ or 3.2+") | |||||
if sys.version_info >= (3, 0): | |||||
basestring = (str, ) | |||||
if ((sys.version_info[0] == 2 and sys.version_info[1] < 6) or | |||||
(sys.version_info[0] == 3 and sys.version_info[1] < 2)): | |||||
raise RuntimeError("mwparserfromhell needs Python 2.6+ or 3.2+") | |||||
from setuptools import setup, find_packages, Extension | from setuptools import setup, find_packages, Extension | ||||
from setuptools.command.build_ext import build_ext | |||||
from mwparserfromhell import __version__ | from mwparserfromhell import __version__ | ||||
from mwparserfromhell.compat import py26, py3k | from mwparserfromhell.compat import py26, py3k | ||||
with open("README.rst", **{'encoding':'utf-8'} if py3k else {}) as fp: | |||||
with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp: | |||||
long_docs = fp.read() | long_docs = fp.read() | ||||
tokenizer = Extension("mwparserfromhell.parser._tokenizer", | |||||
sources=["mwparserfromhell/parser/tokenizer.c"], | |||||
depends=["mwparserfromhell/parser/tokenizer.h"]) | |||||
use_extension = True | |||||
fallback = True | |||||
use_extension=True | |||||
# Allow env var WITHOUT_EXTENSION and args --with[out]-extension: | |||||
# Allow env var WITHOUT_EXTENSION and args --with[out]-extension | |||||
if '--without-extension' in sys.argv: | |||||
use_extension = False | |||||
elif '--with-extension' in sys.argv: | |||||
pass | |||||
elif os.environ.get('WITHOUT_EXTENSION', '0') == '1': | |||||
env_var = environ.get("WITHOUT_EXTENSION") | |||||
if "--without-extension" in sys.argv: | |||||
use_extension = False | use_extension = False | ||||
elif "--with-extension" in sys.argv: | |||||
fallback = False | |||||
elif env_var is not None: | |||||
if env_var == "1": | |||||
use_extension = False | |||||
elif env_var == "0": | |||||
fallback = False | |||||
# Remove the command line argument as it isnt understood by | |||||
# setuptools/distutils | |||||
sys.argv = [arg for arg in sys.argv | |||||
if not arg.startswith('--with') | |||||
and not arg.endswith('-extension')] | |||||
# Remove the command line argument as it isn't understood by setuptools: | |||||
def optional_compile_setup(func=setup, use_ext=use_extension, | |||||
*args, **kwargs): | |||||
""" | |||||
Wrap setup to allow optional compilation of extensions. | |||||
Falls back to pure python mode (no extensions) | |||||
if compilation of extensions fails. | |||||
""" | |||||
extensions = kwargs.get('ext_modules', None) | |||||
if use_ext and extensions: | |||||
try: | |||||
func(*args, **kwargs) | |||||
return | |||||
except SystemExit as e: | |||||
assert(e.args) | |||||
if e.args[0] is False: | |||||
raise | |||||
elif isinstance(e.args[0], basestring): | |||||
if e.args[0].startswith('usage: '): | |||||
raise | |||||
else: | |||||
# Fallback to pure python mode | |||||
print('setup with extension failed: %s' % repr(e)) | |||||
pass | |||||
except Exception as e: | |||||
print('setup with extension failed: %s' % repr(e)) | |||||
sys.argv = [arg for arg in sys.argv | |||||
if arg != "--without-extension" and arg != "--with-extension"] | |||||
if extensions: | |||||
if use_ext: | |||||
print('Falling back to pure python mode.') | |||||
else: | |||||
print('Using pure python mode.') | |||||
def build_ext_patched(self): | |||||
try: | |||||
build_ext_original(self) | |||||
except (DistutilsError, CCompilerError) as exc: | |||||
print("error: " + str(exc)) | |||||
print("Falling back to pure Python mode.") | |||||
del self.extensions[:] | |||||
del kwargs['ext_modules'] | |||||
if fallback: | |||||
build_ext.run, build_ext_original = build_ext_patched, build_ext.run | |||||
func(*args, **kwargs) | |||||
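
Condensed to its essentials, the fallback is a temporary monkey-patch of setuptools' build_ext.run: compiler errors are caught, the extension list is emptied, and the build continues in pure Python. A self-contained sketch of the same pattern, simplified from the logic above:

    from distutils.errors import CCompilerError, DistutilsError
    from setuptools.command.build_ext import build_ext

    def build_ext_patched(self):
        try:
            build_ext_original(self)
        except (DistutilsError, CCompilerError) as exc:
            print("error: " + str(exc))
            print("Falling back to pure Python mode.")
            del self.extensions[:]  # drop the C extensions, keep going

    # Swap the patched method in before setup() runs; a compiler failure
    # then degrades the install instead of aborting it.
    build_ext.run, build_ext_original = build_ext_patched, build_ext.run

The flags combine so that --with-extension (or WITHOUT_EXTENSION=0) disables the fallback and fails hard, while --without-extension (or WITHOUT_EXTENSION=1) skips compilation entirely.
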
# Project-specific part begins here: | |||||
tokenizer = Extension("mwparserfromhell.parser._tokenizer", | |||||
sources=glob("mwparserfromhell/parser/ctokenizer/*.c"), | |||||
depends=glob("mwparserfromhell/parser/ctokenizer/*.h")) | |||||
optional_compile_setup( | |||||
setup( | |||||
name = "mwparserfromhell", | name = "mwparserfromhell", | ||||
packages = find_packages(exclude=("tests",)), | packages = find_packages(exclude=("tests",)), | ||||
ext_modules = [tokenizer], | |||||
ext_modules = [tokenizer] if use_extension else [], | |||||
tests_require = ["unittest2"] if py26 else [], | tests_require = ["unittest2"] if py26 else [], | ||||
test_suite = "tests.discover", | test_suite = "tests.discover", | ||||
version = __version__, | version = __version__, | ||||
@@ -126,6 +105,7 @@ optional_compile_setup( | |||||
"Programming Language :: Python :: 3.2", | "Programming Language :: Python :: 3.2", | ||||
"Programming Language :: Python :: 3.3", | "Programming Language :: Python :: 3.3", | ||||
"Programming Language :: Python :: 3.4", | "Programming Language :: Python :: 3.4", | ||||
"Programming Language :: Python :: 3.5", | |||||
"Topic :: Text Processing :: Markup" | "Topic :: Text Processing :: Markup" | ||||
], | ], | ||||
) | ) |
@@ -42,8 +42,8 @@ class TokenizerTestCase(object): | |||||
directory. | directory. | ||||
""" | """ | ||||
@classmethod | |||||
def _build_test_method(cls, funcname, data): | |||||
@staticmethod | |||||
def _build_test_method(funcname, data): | |||||
"""Create and return a method to be treated as a test case method. | """Create and return a method to be treated as a test case method. | ||||
*data* is a dict containing multiple keys: the *input* text to be | *data* is a dict containing multiple keys: the *input* text to be | ||||
@@ -58,13 +58,35 @@ class TokenizerTestCase(object): | |||||
expected = data["output"] | expected = data["output"] | ||||
actual = self.tokenizer().tokenize(data["input"]) | actual = self.tokenizer().tokenize(data["input"]) | ||||
self.assertEqual(expected, actual) | self.assertEqual(expected, actual) | ||||
if not py3k: | if not py3k: | ||||
inner.__name__ = funcname.encode("utf8") | inner.__name__ = funcname.encode("utf8") | ||||
inner.__doc__ = data["label"] | inner.__doc__ = data["label"] | ||||
return inner | return inner | ||||
@staticmethod | |||||
def _parse_test(test, data): | |||||
"""Parse an individual *test*, storing its info in *data*.""" | |||||
for line in test.strip().splitlines(): | |||||
if line.startswith("name:"): | |||||
data["name"] = line[len("name:"):].strip() | |||||
elif line.startswith("label:"): | |||||
data["label"] = line[len("label:"):].strip() | |||||
elif line.startswith("input:"): | |||||
raw = line[len("input:"):].strip() | |||||
if raw[0] == '"' and raw[-1] == '"': | |||||
raw = raw[1:-1] | |||||
raw = raw.encode("raw_unicode_escape") | |||||
data["input"] = raw.decode("unicode_escape") | |||||
elif line.startswith("output:"): | |||||
raw = line[len("output:"):].strip() | |||||
try: | |||||
data["output"] = eval(raw, vars(tokens)) | |||||
except Exception as err: | |||||
raise _TestParseError(err) | |||||
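
As a concrete reference, here is what a block in the .mwtest format looks like and what the method extracts from it (hypothetical test content; runnable inside this test module, where tokens is imported):

    sample = ("name: example\n"
              "label: a trivial example\n"
              'input: "{{foo}}"\n'
              'output: [TemplateOpen(), Text(text="foo"), TemplateClose()]')
    data = {"name": None, "label": None, "input": None, "output": None}
    TokenizerTestCase._parse_test(sample, data)
    # data["name"] == "example", data["input"] == "{{foo}}", and
    # data["output"] is a token list built by eval() over the tokens
    # module's namespace.
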
@classmethod | @classmethod | ||||
def _load_tests(cls, filename, name, text): | |||||
def _load_tests(cls, filename, name, text, restrict=None): | |||||
"""Load all tests in *text* from the file *filename*.""" | """Load all tests in *text* from the file *filename*.""" | ||||
tests = text.split("\n---\n") | tests = text.split("\n---\n") | ||||
counter = 1 | counter = 1 | ||||
@@ -72,23 +94,7 @@ class TokenizerTestCase(object): | |||||
for test in tests: | for test in tests: | ||||
data = {"name": None, "label": None, "input": None, "output": None} | data = {"name": None, "label": None, "input": None, "output": None} | ||||
try: | try: | ||||
for line in test.strip().splitlines(): | |||||
if line.startswith("name:"): | |||||
data["name"] = line[len("name:"):].strip() | |||||
elif line.startswith("label:"): | |||||
data["label"] = line[len("label:"):].strip() | |||||
elif line.startswith("input:"): | |||||
raw = line[len("input:"):].strip() | |||||
if raw[0] == '"' and raw[-1] == '"': | |||||
raw = raw[1:-1] | |||||
raw = raw.encode("raw_unicode_escape") | |||||
data["input"] = raw.decode("unicode_escape") | |||||
elif line.startswith("output:"): | |||||
raw = line[len("output:"):].strip() | |||||
try: | |||||
data["output"] = eval(raw, vars(tokens)) | |||||
except Exception as err: | |||||
raise _TestParseError(err) | |||||
cls._parse_test(test, data) | |||||
except _TestParseError as err: | except _TestParseError as err: | ||||
if data["name"]: | if data["name"]: | ||||
error = "Could not parse test '{0}' in '{1}':\n\t{2}" | error = "Could not parse test '{0}' in '{1}':\n\t{2}" | ||||
@@ -97,6 +103,7 @@ class TokenizerTestCase(object): | |||||
error = "Could not parse a test in '{0}':\n\t{1}" | error = "Could not parse a test in '{0}':\n\t{1}" | ||||
print(error.format(filename, err)) | print(error.format(filename, err)) | ||||
continue | continue | ||||
if not data["name"]: | if not data["name"]: | ||||
error = "A test in '{0}' was ignored because it lacked a name" | error = "A test in '{0}' was ignored because it lacked a name" | ||||
print(error.format(filename)) | print(error.format(filename)) | ||||
@@ -105,27 +112,35 @@ class TokenizerTestCase(object): | |||||
error = "Test '{0}' in '{1}' was ignored because it lacked an input or an output" | error = "Test '{0}' in '{1}' was ignored because it lacked an input or an output" | ||||
print(error.format(data["name"], filename)) | print(error.format(data["name"], filename)) | ||||
continue | continue | ||||
number = str(counter).zfill(digits) | number = str(counter).zfill(digits) | ||||
counter += 1 | |||||
if restrict and data["name"] != restrict: | |||||
continue | |||||
fname = "test_{0}{1}_{2}".format(name, number, data["name"]) | fname = "test_{0}{1}_{2}".format(name, number, data["name"]) | ||||
meth = cls._build_test_method(fname, data) | meth = cls._build_test_method(fname, data) | ||||
setattr(cls, fname, meth) | setattr(cls, fname, meth) | ||||
counter += 1 | |||||
@classmethod | @classmethod | ||||
def build(cls): | def build(cls): | ||||
"""Load and install all tests from the 'tokenizer' directory.""" | """Load and install all tests from the 'tokenizer' directory.""" | ||||
def load_file(filename): | |||||
def load_file(filename, restrict=None): | |||||
with codecs.open(filename, "rU", encoding="utf8") as fp: | with codecs.open(filename, "rU", encoding="utf8") as fp: | ||||
text = fp.read() | text = fp.read() | ||||
name = path.split(filename)[1][:0-len(extension)] | |||||
cls._load_tests(filename, name, text) | |||||
name = path.split(filename)[1][:-len(extension)] | |||||
cls._load_tests(filename, name, text, restrict) | |||||
directory = path.join(path.dirname(__file__), "tokenizer") | directory = path.join(path.dirname(__file__), "tokenizer") | ||||
extension = ".mwtest" | extension = ".mwtest" | ||||
if len(sys.argv) > 2 and sys.argv[1] == "--use": | if len(sys.argv) > 2 and sys.argv[1] == "--use": | ||||
for name in sys.argv[2:]: | for name in sys.argv[2:]: | ||||
load_file(path.join(directory, name + extension)) | |||||
sys.argv = [sys.argv[0]] # So unittest doesn't try to load these | |||||
if "." in name: | |||||
name, test = name.split(".", 1) | |||||
else: | |||||
test = None | |||||
load_file(path.join(directory, name + extension), test) | |||||
sys.argv = [sys.argv[0]] # So unittest doesn't try to parse this | |||||
cls.skip_others = True | cls.skip_others = True | ||||
else: | else: | ||||
for filename in listdir(directory): | for filename in listdir(directory): | ||||
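
With the dotted syntax handled above, a single named test can now be run in isolation. A usage sketch (the script path is an assumption about the repository layout; blank_params is a test name from templates.mwtest elsewhere in this diff):

    import subprocess, sys

    # Load only templates.mwtest, and only its blank_params test:
    subprocess.check_call([sys.executable, "tests/test_tokenizer.py",
                           "--use", "templates.blank_params"])
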
@@ -115,8 +115,8 @@ class TestDocs(unittest.TestCase): | |||||
@unittest.skipIf("NOWEB" in os.environ, "web test disabled by environ var") | @unittest.skipIf("NOWEB" in os.environ, "web test disabled by environ var") | ||||
def test_readme_5(self): | def test_readme_5(self): | ||||
"""test a block of example code in the README; includes a web call""" | """test a block of example code in the README; includes a web call""" | ||||
url1 = "http://en.wikipedia.org/w/api.php" | |||||
url2 = "http://en.wikipedia.org/w/index.php?title={0}&action=raw" | |||||
url1 = "https://en.wikipedia.org/w/api.php" | |||||
url2 = "https://en.wikipedia.org/w/index.php?title={0}&action=raw" | |||||
title = "Test" | title = "Test" | ||||
data = {"action": "query", "prop": "revisions", "rvlimit": 1, | data = {"action": "query", "prop": "revisions", "rvlimit": 1, | ||||
"rvprop": "content", "format": "json", "titles": title} | "rvprop": "content", "format": "json", "titles": title} | ||||
@@ -52,6 +52,7 @@ class TestSmartList(unittest.TestCase): | |||||
self.assertEqual([0, 1, 2], list1[:3]) | self.assertEqual([0, 1, 2], list1[:3]) | ||||
self.assertEqual([0, 1, 2, 3, "one", "two"], list1[:]) | self.assertEqual([0, 1, 2, 3, "one", "two"], list1[:]) | ||||
self.assertEqual([3, "one", "two"], list1[3:]) | self.assertEqual([3, "one", "two"], list1[3:]) | ||||
self.assertEqual([3, "one", "two"], list1[3:100]) | |||||
self.assertEqual(["one", "two"], list1[-2:]) | self.assertEqual(["one", "two"], list1[-2:]) | ||||
self.assertEqual([0, 1], list1[:-4]) | self.assertEqual([0, 1], list1[:-4]) | ||||
self.assertEqual([], list1[6:]) | self.assertEqual([], list1[6:]) | ||||
@@ -389,28 +390,35 @@ class TestSmartList(unittest.TestCase): | |||||
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1) | self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1) | ||||
self.assertEqual([4, 3, 2, 1.9, 1.8], child2) | self.assertEqual([4, 3, 2, 1.9, 1.8], child2) | ||||
child1.detach() | |||||
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], parent) | |||||
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1) | |||||
child3 = parent[9:] | |||||
self.assertEqual([8, 8.1, 8.2], child3) | |||||
del parent[8:] | |||||
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6], parent) | |||||
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6], child1) | |||||
self.assertEqual([4, 3, 2, 1.9, 1.8], child2) | |||||
self.assertEqual([], child3) | |||||
del child1 | |||||
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6], parent) | |||||
self.assertEqual([4, 3, 2, 1.9, 1.8], child2) | |||||
self.assertEqual([], child3) | |||||
self.assertEqual(2, len(parent._children)) | |||||
del child3 | |||||
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6], parent) | |||||
self.assertEqual([4, 3, 2, 1.9, 1.8], child2) | self.assertEqual([4, 3, 2, 1.9, 1.8], child2) | ||||
self.assertEqual(1, len(parent._children)) | self.assertEqual(1, len(parent._children)) | ||||
parent.remove(1.9) | parent.remove(1.9) | ||||
parent.remove(1.8) | parent.remove(1.8) | ||||
self.assertEqual([1, 4, 3, 2, 5, 6, 7, 8, 8.1, 8.2], parent) | |||||
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1) | |||||
self.assertEqual([1, 4, 3, 2, 5, 6], parent) | |||||
self.assertEqual([4, 3, 2], child2) | self.assertEqual([4, 3, 2], child2) | ||||
parent.reverse() | parent.reverse() | ||||
self.assertEqual([8.2, 8.1, 8, 7, 6, 5, 2, 3, 4, 1], parent) | |||||
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1) | |||||
self.assertEqual([6, 5, 2, 3, 4, 1], parent) | |||||
self.assertEqual([4, 3, 2], child2) | self.assertEqual([4, 3, 2], child2) | ||||
self.assertEqual(0, len(parent._children)) | self.assertEqual(0, len(parent._children)) | ||||
child2.detach() | |||||
self.assertEqual([8.2, 8.1, 8, 7, 6, 5, 2, 3, 4, 1], parent) | |||||
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1) | |||||
self.assertEqual([4, 3, 2], child2) | |||||
if __name__ == "__main__": | if __name__ == "__main__": | ||||
unittest.main(verbosity=2) | unittest.main(verbosity=2) |
@@ -213,6 +213,9 @@ class TestTemplate(TreeEqualityTestCase): | |||||
pgens("f", "g")]) | pgens("f", "g")]) | ||||
node37 = Template(wraptext("a"), [pgenh("1", "")]) | node37 = Template(wraptext("a"), [pgenh("1", "")]) | ||||
node38 = Template(wraptext("abc")) | node38 = Template(wraptext("abc")) | ||||
node39 = Template(wraptext("a"), [pgenh("1", " b ")]) | |||||
node40 = Template(wraptext("a"), [pgenh("1", " b"), pgenh("2", " c")]) | |||||
node41 = Template(wraptext("a"), [pgens("1", " b"), pgens("2", " c")]) | |||||
node1.add("e", "f", showkey=True) | node1.add("e", "f", showkey=True) | ||||
node2.add(2, "g", showkey=False) | node2.add(2, "g", showkey=False) | ||||
@@ -255,6 +258,9 @@ class TestTemplate(TreeEqualityTestCase): | |||||
node37.add(1, "b") | node37.add(1, "b") | ||||
node38.add("1", "foo") | node38.add("1", "foo") | ||||
self.assertRaises(ValueError, node38.add, "z", "bar", showkey=False) | self.assertRaises(ValueError, node38.add, "z", "bar", showkey=False) | ||||
node39.add("1", "c") | |||||
node40.add("3", "d") | |||||
node41.add("3", "d") | |||||
self.assertEqual("{{a|b=c|d|e=f}}", node1) | self.assertEqual("{{a|b=c|d|e=f}}", node1) | ||||
self.assertEqual("{{a|b=c|d|g}}", node2) | self.assertEqual("{{a|b=c|d|g}}", node2) | ||||
@@ -299,6 +305,9 @@ class TestTemplate(TreeEqualityTestCase): | |||||
self.assertEqual("{{a|b=c|d=h|f=g}}", node36) | self.assertEqual("{{a|b=c|d=h|f=g}}", node36) | ||||
self.assertEqual("{{a|b}}", node37) | self.assertEqual("{{a|b}}", node37) | ||||
self.assertEqual("{{abc|foo}}", node38) | self.assertEqual("{{abc|foo}}", node38) | ||||
self.assertEqual("{{a|c}}", node39) | |||||
self.assertEqual("{{a| b| c|d}}", node40) | |||||
self.assertEqual("{{a|1= b|2= c|3= d}}", node41) | |||||
def test_remove(self): | def test_remove(self): | ||||
"""test Template.remove()""" | """test Template.remove()""" | ||||
@@ -395,13 +404,13 @@ class TestTemplate(TreeEqualityTestCase): | |||||
self.assertRaises(ValueError, node2.remove, "1") | self.assertRaises(ValueError, node2.remove, "1") | ||||
self.assertEqual("{{foo}}", node2) | self.assertEqual("{{foo}}", node2) | ||||
self.assertEqual("{{foo||abc=}}", node3) | self.assertEqual("{{foo||abc=}}", node3) | ||||
self.assertEqual("{{foo||baz}}", node4) | |||||
self.assertEqual("{{foo|2=baz}}", node4) | |||||
self.assertEqual("{{foo|b=c}}", node5) | self.assertEqual("{{foo|b=c}}", node5) | ||||
self.assertEqual("{{foo| a=|b=c}}", node6) | self.assertEqual("{{foo| a=|b=c}}", node6) | ||||
self.assertEqual("{{foo|1 =|2=c}}", node7) | self.assertEqual("{{foo|1 =|2=c}}", node7) | ||||
self.assertEqual("{{foo|2=c}}", node8) | self.assertEqual("{{foo|2=c}}", node8) | ||||
self.assertEqual("{{foo||c}}", node9) | self.assertEqual("{{foo||c}}", node9) | ||||
self.assertEqual("{{foo||c}}", node10) | |||||
self.assertEqual("{{foo|2=c}}", node10) | |||||
self.assertEqual("{{foo|b=c|a =d}}", node11) | self.assertEqual("{{foo|b=c|a =d}}", node11) | ||||
self.assertEqual("{{foo| a=|b=c|a =d}}", node12) | self.assertEqual("{{foo| a=|b=c|a =d}}", node12) | ||||
self.assertEqual("{{foo| a=b|a =d}}", node13) | self.assertEqual("{{foo| a=b|a =d}}", node13) | ||||
@@ -410,7 +419,7 @@ class TestTemplate(TreeEqualityTestCase): | |||||
self.assertEqual("{{foo| a=b|b=c|a =}}", node16) | self.assertEqual("{{foo| a=b|b=c|a =}}", node16) | ||||
self.assertEqual("{{foo|b|c}}", node17) | self.assertEqual("{{foo|b|c}}", node17) | ||||
self.assertEqual("{{foo|1 =|b|c}}", node18) | self.assertEqual("{{foo|1 =|b|c}}", node18) | ||||
self.assertEqual("{{foo|1 =a||c}}", node19) | |||||
self.assertEqual("{{foo|1 =a|2=c}}", node19) | |||||
self.assertEqual("{{foo|1 =a||c}}", node20) | self.assertEqual("{{foo|1 =a||c}}", node20) | ||||
self.assertEqual("{{foo|c=d|e=f}}", node21) | self.assertEqual("{{foo|c=d|e=f}}", node21) | ||||
self.assertEqual("{{foo|a=|c=d|e=f}}", node22) | self.assertEqual("{{foo|a=|c=d|e=f}}", node22) | ||||
@@ -241,3 +241,80 @@ name: newline_and_comment_in_template_name_7 | |||||
label: a template name containing a newline followed by a comment | label: a template name containing a newline followed by a comment | ||||
input: "{{foobar\n<!|key=value}}" | input: "{{foobar\n<!|key=value}}" | ||||
output: [Text(text="{{foobar\n<!|key=value}}")] | output: [Text(text="{{foobar\n<!|key=value}}")] | ||||
--- | |||||
name: newline_and_comment_in_template_name_8 | |||||
label: a template name surrounded by comments and newlines |||||
input: "{{<!-- comment -->\nfoobar\n<!-- comment -->}}" | |||||
output: [TemplateOpen(), CommentStart(), Text(text=" comment "), CommentEnd(), Text(text="\nfoobar\n"), CommentStart(), Text(text=" comment "), CommentEnd(), TemplateClose()] | |||||
--- | |||||
name: tag_in_link_title | |||||
label: HTML tags are invalid in link titles, even when complete | |||||
input: "[[foo<i>bar</i>baz]]" | |||||
output: [Text(text="[[foo"), TagOpenOpen(), Text(text="i"), TagCloseOpen(padding=""), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="baz]]")] | |||||
--- | |||||
name: tag_in_template_name | |||||
label: HTML tags are invalid in template names, even when complete | |||||
input: "{{foo<i>bar</i>baz}}" | |||||
output: [Text(text="{{foo"), TagOpenOpen(), Text(text="i"), TagCloseOpen(padding=""), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="baz}}")] | |||||
--- | |||||
name: tag_in_link_text | |||||
label: HTML tags are valid in link text | |||||
input: "[[foo|<i>bar</i>baz]]" | |||||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), TagOpenOpen(), Text(text="i"), TagCloseOpen(padding=""), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="baz"), WikilinkClose()] | |||||
--- | |||||
name: comment_in_link_title | |||||
label: comments are valid in link titles | |||||
input: "[[foo<!--bar-->baz]]" | |||||
output: [WikilinkOpen(), Text(text="foo"), CommentStart(), Text(text="bar"), CommentEnd(), Text(text="baz"), WikilinkClose()] | |||||
--- | |||||
name: incomplete_comment_in_link_title | |||||
label: incomplete comments are invalid in link titles | |||||
input: "[[foo<!--bar--baz]]" | |||||
output: [Text(text="[[foo<!--bar--baz]]")] | |||||
--- | |||||
name: incomplete_comment_in_link_title_2 | |||||
label: incomplete comments are invalid in link titles | |||||
input: "[[foo<!--barbaz]]" | |||||
output: [Text(text="[[foo<!--barbaz]]")] | |||||
--- | |||||
name: incomplete_comment_in_link_title_3 | |||||
label: incomplete comments are invalid in link titles | |||||
input: "[[foo<!barbaz]]" | |||||
output: [Text(text="[[foo<!barbaz]]")] | |||||
--- | |||||
name: incomplete_comment_in_link_title_4 | |||||
label: incomplete comments are invalid in link titles | |||||
input: "[[foo<!--]]" | |||||
output: [Text(text="[[foo<!--]]")] | |||||
--- | |||||
name: incomplete_comment_in_link_title_5 | |||||
label: incomplete comments are invalid in link titles | |||||
input: "[[foo<!-" | |||||
output: [Text(text="[[foo<!-")] | |||||
--- | |||||
name: incomplete_comment_in_link_title_6 | |||||
label: incomplete comments are invalid in link titles | |||||
input: "[[foo<!--bar" | |||||
output: [Text(text="[[foo<!--bar")] |
@@ -1,17 +1,3 @@ | |||||
name: blank | |||||
label: template with no content | |||||
input: "{{}}" | |||||
output: [TemplateOpen(), TemplateClose()] |||||
--- |||||
name: blank_with_params | |||||
label: template with no content, but pipes and equal signs | |||||
input: "{{||=|}}" | |||||
output: [TemplateOpen(), TemplateParamSeparator(), TemplateParamSeparator(), TemplateParamEquals(), TemplateParamSeparator(), TemplateClose()] |||||
--- |||||
name: no_params | name: no_params | ||||
label: simplest type of template | label: simplest type of template | ||||
input: "{{template}}" | input: "{{template}}" | ||||
@@ -61,6 +47,13 @@ output: [TemplateOpen(), Text(text="foo"), TemplateParamSeparator(), Text(text=" | |||||
--- | --- | ||||
name: blank_params | |||||
label: template with blank parameters (mix of pipes and equal signs) | |||||
input: "{{,||=|}}" | |||||
output: [TemplateOpen(), Text(text=","), TemplateParamSeparator(), TemplateParamSeparator(), TemplateParamEquals(), TemplateParamSeparator(), TemplateClose()] | |||||
--- | |||||
name: nested_unnamed_param | name: nested_unnamed_param | ||||
label: nested template as an unnamed parameter | label: nested template as an unnamed parameter | ||||
input: "{{foo|{{bar}}}}" | input: "{{foo|{{bar}}}}" | ||||
@@ -390,6 +383,34 @@ output: [TemplateOpen(), Text(text="foo\n "), TemplateParamSeparator(), Text(te | |||||
--- | --- | ||||
name: invalid_blank | |||||
label: invalid template with no content | |||||
input: "{{}}" | |||||
output: [Text(text="{{}}")] | |||||
--- | |||||
name: invalid_blank_whitespace | |||||
label: invalid template with no content, but whitespace | |||||
input: "{{ }}" | |||||
output: [Text(text="{{ }}")] | |||||
--- | |||||
name: invalid_blank_pipe | |||||
label: invalid template with no content, but a parameter | |||||
input: "{{|foo}}" | |||||
output: [Text(text="{{|foo}}")] | |||||
--- | |||||
name: invalid_blank_whitespace_pipe | |||||
label: invalid template with no content, but whitespace and a parameter | |||||
input: "{{ |foo}}" | |||||
output: [Text(text="{{ |foo}}")] | |||||
--- | |||||
name: invalid_name_left_brace_middle | name: invalid_name_left_brace_middle | ||||
label: invalid characters in template name: left brace in middle | label: invalid characters in template name: left brace in middle | ||||
input: "{{foo{bar}}" | input: "{{foo{bar}}" | ||||
@@ -509,6 +530,20 @@ output: [TemplateOpen(), Text(text="foo"), TemplateParamSeparator(), Text(text=" | |||||
--- | --- | ||||
name: invalid_left_angle_bracket | |||||
label: invalid template: left angle bracket in name | |||||
input: "{{foo<bar}}" | |||||
output: [Text(text="{{foo<bar}}")] | |||||
--- | |||||
name: invalid_right_angle_bracket | |||||
label: invalid template: right angle bracket in name | |||||
input: "{{foo>bar}}" | |||||
output: [Text(text="{{foo>bar}}")] | |||||
--- | |||||
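
Collecting the new invalid-template cases, the observable behavior is that each input survives untouched as plain text (a sketch mirroring the expected tokens above):

    import mwparserfromhell

    for text in ("{{}}", "{{ }}", "{{|foo}}", "{{foo<bar}}", "{{foo>bar}}"):
        # No Template node is produced; the markup stays literal text.
        assert mwparserfromhell.parse(text).filter_templates() == []
        assert str(mwparserfromhell.parse(text)) == text
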
name: incomplete_stub | name: incomplete_stub | ||||
label: incomplete templates that should fail gracefully: just an opening | label: incomplete templates that should fail gracefully: just an opening | ||||
input: "{{" | input: "{{" | ||||
@@ -651,5 +686,5 @@ output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ | |||||
name: recursion_opens_and_closes | name: recursion_opens_and_closes | ||||
label: test potentially dangerous recursion: template openings and closings | label: test potentially dangerous recursion: template openings and closings | ||||
input: "{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}" | |||||
output: [Text(text="{{|"), TemplateOpen(), TemplateClose(), Text(text="{{|"), TemplateOpen(), TemplateClose(), TemplateOpen(), TemplateParamSeparator(), TemplateOpen(), TemplateClose(), Text(text="{{"), TemplateParamSeparator(), Text(text="{{"), TemplateClose(), Text(text="{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}")] | |||||
input: "{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}" | |||||
output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), TemplateOpen(), Text(text="x"), TemplateParamSeparator(), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x"), TemplateParamSeparator(), Text(text="{{x"), TemplateClose(), Text(text="{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}")] |
@@ -27,6 +27,6 @@ output: [Text(text="𐌲𐌿𐍄𐌰𐍂𐌰𐌶𐌳𐌰")] | |||||
--- | --- | ||||
name: large | name: large | ||||
label: a lot of text, requiring multiple textbuffer blocks in the C tokenizer | |||||
label: a lot of text, requiring proper storage in the C tokenizer | |||||
input: "ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN" | input: 
"ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN" | ||||
output: [Text(text="ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN")] | output: 
[Text(text="ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN")] |
@@ -124,6 +124,34 @@ output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()] | |||||
--- | --- | ||||
name: invalid_left_angle_bracket | |||||
label: invalid wikilink: left angle bracket | |||||
input: "[[foo<bar]]" | |||||
output: [Text(text="[[foo<bar]]")] | |||||
--- | |||||
name: invalid_right_angle_bracket | |||||
label: invalid wikilink: right angle bracket | |||||
input: "[[foo>bar]]" | |||||
output: [Text(text="[[foo>bar]]")] | |||||
--- | |||||
name: invalid_newline_at_start | |||||
label: invalid wikilink: newline at start of title | |||||
input: "[[\nfoobar]]" | |||||
output: [Text(text="[[\nfoobar]]")] | |||||
--- | |||||
name: invalid_newline_at_end | |||||
label: invalid wikilink: newline at end of title | |||||
input: "[[foobar\n]]" | |||||
output: [Text(text="[[foobar\n]]")] | |||||
--- | |||||
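
And the same checks for the new wikilink restrictions (a sketch mirroring the tokens above):

    import mwparserfromhell

    for text in ("[[foo<bar]]", "[[foo>bar]]", "[[\nfoobar]]", "[[foobar\n]]"):
        # Each input stays literal text; no Wikilink node is produced.
        assert mwparserfromhell.parse(text).filter_wikilinks() == []
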
name: incomplete_open_only | name: incomplete_open_only | ||||
label: incomplete wikilinks: just an open | label: incomplete wikilinks: just an open | ||||
input: "[[" | input: "[[" | ||||