From 99cf0a4412139abf3255ca3fe5123aca3c7c14b4 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 22 Apr 2014 16:44:14 -0400 Subject: [PATCH 001/102] Version bump to 0.4.dev. --- CHANGELOG | 4 ++++ docs/changelog.rst | 8 ++++++++ mwparserfromhell/__init__.py | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 9faf6b7..6be48c6 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,7 @@ +v0.4 (unreleased): + +- + v0.3.3 (released April 22, 2014): - Added support for Python 2.6 and 3.4. diff --git a/docs/changelog.rst b/docs/changelog.rst index 9efc022..3f2ba0e 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,14 @@ Changelog ========= +v0.4 +---- + +Unreleased +(`changes `__): + +- + v0.3.3 ------ diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 469e9a6..e7459e3 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -31,7 +31,7 @@ from __future__ import unicode_literals __author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2012, 2013, 2014 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.3.3" +__version__ = "0.4.dev" __email__ = "ben.kurtovic@gmail.com" from . import (compat, definitions, nodes, parser, smart_list, string_mixin, From 2fe8826a9dec3d1015ff7a69857fb282617d3a45 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 22 Apr 2014 18:31:00 -0400 Subject: [PATCH 002/102] Added a script to test for memory leaks in scripts/memtest.py. --- CHANGELOG | 2 +- docs/changelog.rst | 2 +- scripts/memtest.py | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 scripts/memtest.py diff --git a/CHANGELOG b/CHANGELOG index 6be48c6..564b09c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,6 @@ v0.4 (unreleased): -- +- Added a script to test for memory leaks in scripts/memtest.py. v0.3.3 (released April 22, 2014): diff --git a/docs/changelog.rst b/docs/changelog.rst index 3f2ba0e..5a59be0 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -7,7 +7,7 @@ v0.4 Unreleased (`changes `__): -- +- Added a script to test for memory leaks in :file:`scripts/memtest.py`. v0.3.3 ------ diff --git a/scripts/memtest.py b/scripts/memtest.py new file mode 100644 index 0000000..e6b8011 --- /dev/null +++ b/scripts/memtest.py @@ -0,0 +1,170 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012-2014 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +""" +Tests for memory leaks in the CTokenizer. Python 2 and 3 compatible. + +This appears to work mostly fine under Linux, but gives an absurd number of +false positives on OS X. I'm not sure why. Running the tests multiple times +yields different results (tests don't always leak, and the amount they leak by +varies). Increasing the number of loops results in a smaller bytes/loop value, +too, indicating the increase in memory usage might be due to something else. +Actual memory leaks typically leak very large amounts of memory (megabytes) +and scale with the number of loops. +""" + +from __future__ import unicode_literals, print_function +from locale import LC_ALL, setlocale +from multiprocessing import Process, Pipe +from os import listdir, path +import sys + +import psutil + +from mwparserfromhell.compat import py3k +from mwparserfromhell.parser._tokenizer import CTokenizer + +if sys.version_info[0] == 2: + range = xrange + +LOOPS = 10000 + +class Color(object): + GRAY = "\x1b[30;1m" + GREEN = "\x1b[92m" + YELLOW = "\x1b[93m" + RESET = "\x1b[0m" + + +class MemoryTest(object): + """Manages a memory test.""" + + def __init__(self): + self._tests = [] + self._load() + + def _parse_file(self, name, text): + tests = text.split("\n---\n") + counter = 1 + digits = len(str(len(tests))) + for test in tests: + data = {"name": None, "label": None, "input": None, "output": None} + for line in test.strip().splitlines(): + if line.startswith("name:"): + data["name"] = line[len("name:"):].strip() + elif line.startswith("label:"): + data["label"] = line[len("label:"):].strip() + elif line.startswith("input:"): + raw = line[len("input:"):].strip() + if raw[0] == '"' and raw[-1] == '"': + raw = raw[1:-1] + raw = raw.encode("raw_unicode_escape") + data["input"] = raw.decode("unicode_escape") + number = str(counter).zfill(digits) + fname = "test_{0}{1}_{2}".format(name, number, data["name"]) + self._tests.append((fname, data["input"])) + counter += 1 + + def _load(self): + def load_file(filename): + with open(filename, "rU") as fp: + text = fp.read() + if not py3k: + text = text.decode("utf8") + name = path.split(filename)[1][:0-len(extension)] + self._parse_file(name, text) + + root = path.split(path.dirname(path.abspath(__file__)))[0] + directory = path.join(root, "tests", "tokenizer") + extension = ".mwtest" + if len(sys.argv) > 2 and sys.argv[1] == "--use": + for name in sys.argv[2:]: + load_file(path.join(directory, name + extension)) + sys.argv = [sys.argv[0]] # So unittest doesn't try to load these + else: + for filename in listdir(directory): + if not filename.endswith(extension): + continue + load_file(path.join(directory, filename)) + + @staticmethod + def _print_results(info1, info2): + r1, r2 = info1.rss, info2.rss + buff = 8192 + if r2 - buff > r1: + d = r2 - r1 + p = float(d) / r1 + bpt = d // LOOPS + tmpl = "{0}LEAKING{1}: {2:n} bytes, {3:.2%} inc ({4:n} bytes/loop)" + sys.stdout.write(tmpl.format(Color.YELLOW, Color.RESET, d, p, bpt)) + else: + sys.stdout.write("{0}OK{1}".format(Color.GREEN, Color.RESET)) + + def run(self): + """Run the memory test suite.""" + width = 1 + for (name, _) in self._tests: + if len(name) > width: + width = len(name) + + tmpl = "{0}[{1:03}/{2}]{3} {4}: " + for i, (name, text) in enumerate(self._tests, 1): + sys.stdout.write(tmpl.format(Color.GRAY, i, len(self._tests), + Color.RESET, name.ljust(width))) + sys.stdout.flush() + parent, child = Pipe() + p = Process(target=_runner, args=(text, child)) + p.start() + try: + proc = psutil.Process(p.pid) + parent.recv() 
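+            # The child signals over the pipe once its 250-iteration warm-up
+            # loop is done, so info1 below is a post-warm-up baseline; info2
+            # is sampled after the main LOOPS-iteration run, and
+            # _print_results() compares the two readings.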
+ parent.send("OK") + parent.recv() + info1 = proc.get_memory_info() + sys.stdout.flush() + parent.send("OK") + parent.recv() + info2 = proc.get_memory_info() + self._print_results(info1, info2) + sys.stdout.flush() + parent.send("OK") + finally: + proc.kill() + print() + + +def _runner(text, child): + r1, r2 = range(250), range(LOOPS) + for i in r1: + CTokenizer().tokenize(text) + child.send("OK") + child.recv() + child.send("OK") + child.recv() + for i in r2: + CTokenizer().tokenize(text) + child.send("OK") + child.recv() + +if __name__ == "__main__": + setlocale(LC_ALL, "") + MemoryTest().run() From 5d08e9e316f826e3b1c52e6424b583b6e4dd41c9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 22 Apr 2014 22:35:45 -0400 Subject: [PATCH 003/102] Created a release script; added a MANIFEST.in file. --- CHANGELOG | 1 + MANIFEST.in | 2 + docs/changelog.rst | 1 + scripts/release.sh | 155 +++++++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 3 +- 5 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 MANIFEST.in create mode 100755 scripts/release.sh diff --git a/CHANGELOG b/CHANGELOG index 564b09c..98a1f96 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ v0.4 (unreleased): - Added a script to test for memory leaks in scripts/memtest.py. +- Added a script to do releases in scripts/release.sh. v0.3.3 (released April 22, 2014): diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..27e8a54 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include LICENSE CHANGELOG +recursive-include tests *.py *.mwtest diff --git a/docs/changelog.rst b/docs/changelog.rst index 5a59be0..3ce507e 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -8,6 +8,7 @@ Unreleased (`changes `__): - Added a script to test for memory leaks in :file:`scripts/memtest.py`. +- Added a script to do releases in :file:`scripts/release.sh`. v0.3.3 ------ diff --git a/scripts/release.sh b/scripts/release.sh new file mode 100755 index 0000000..c10871d --- /dev/null +++ b/scripts/release.sh @@ -0,0 +1,155 @@ +#! /usr/bin/env bash + +if [[ -z "$1" ]]; then + echo "usage: $0 1.2.3" + exit 1 +fi + +VERSION=$1 +SCRIPT_DIR=$(dirname "$0") +RELEASE_DATE=$(date +"%B %d, %Y") + +check_git() { + if [[ -n "$(git status --porcelain --untracked-files=no)" ]]; then + echo "Aborting: dirty working directory." + exit 1 + fi + if [[ "$(git rev-parse --abbrev-ref HEAD)" != "develop" ]]; then + echo "Aborting: not on develop." + exit 1 + fi + echo -n "Are you absolutely ready to release? [yN] " + read confirm + if [[ ${confirm,,} != "y" ]]; then + exit 1 + fi +} + +update_version() { + echo -n "Updating mwparserfromhell.__version__..." + sed -e 's/__version__ = .*/__version__ = "'$VERSION'"/' -i "" mwparserfromhell/__init__.py + echo " done." +} + +update_changelog() { + filename="CHANGELOG" + echo -n "Updating $filename..." + sed -e '1s/.*/v'$VERSION' (released '$RELEASE_DATE'):/' -i "" $filename + echo " done." +} + +update_docs_changelog() { + filename="docs/changelog.rst" + echo -n "Updating $filename..." + dashes=$(seq 1 $(expr ${#VERSION} + 1) | sed 's/.*/-/' | tr -d '\n') + previous_lineno=$(expr $(grep -n -e "^---" $filename | sed '2q;d' | cut -d ':' -f 1) - 1) + previous_version=$(sed $previous_lineno'q;d' $filename) + sed \ + -e '4s/.*/v'$VERSION \ + -e '5s/.*/'$dashes \ + -e '7s/.*/`Released '$RELEASE_DATE' `_/' \ + -e '8s/.*/(`changes `__):/' \ + -i "" $filename + echo " done." +} + +do_git_stuff() { + echo -n "Git: committing, tagging, and merging release..." 
+ git commit -qam "release/$VERSION" + git tag v$VERSION -s -m "version $VERSION" + git checkout -q master + git merge -q --no-ff develop -m "Merge branch 'develop'" + echo -n " pushing..." + git push -q --tags origin master + git checkout -q develop + git push -q origin develop + echo " done." +} + +build_sdist() { + echo -n "Uploading to PyPI..." + python setup.py register sdist upload -s + python setup.py upload_docs + echo " done." +} + +post_release() { + echo + echo "*** Release completed." + echo "*** Update: https://github.com/earwig/mwparserfromhell/releases/tag/v$VERSION" + echo "*** Verify: https://pypi.python.org/pypi/mwparserfromhell" + echo "*** Verify: https://mwparserfromhell.readthedocs.org" + echo "*** Press enter to sanity-check the release." + read +} + +test_release() { + echo + echo "Checking mwparserfromhell v$VERSION..." + echo -n "Creating a virtualenv..." + virtdir="mwparser-test-env" + virtualenv -q $virtdir + cd $virtdir + source bin/activate + echo " done." + echo -n "Installing mwparserfromhell with pip..." + pip -q install mwparserfromhell + echo " done." + echo -n "Checking version..." + reported_version=$(python -c 'print __import__("mwparserfromhell").__version__') + if [[ "$reported_version" != "$VERSION" ]]; then + echo " error." + echo "*** ERROR: mwparserfromhell is reporting its version as $reported_version, not $VERSION!" + deactivate + cd .. + rm -rf $virtdir + exit 1 + else + echo " done." + fi + pip -q uninstall -y mwparserfromhell + echo -n "Downloading mwparserfromhell source tarball and GPG signature..." + curl -sL "https://pypi.python.org/packages/source/m/mwparserfromhell/mwparserfromhell-$VERSION.tar.gz" -o "mwparserfromhell.tar.gz" + curl -sL "https://pypi.python.org/packages/source/m/mwparserfromhell/mwparserfromhell-$VERSION.tar.gz.asc" -o "mwparserfromhell.tar.gz.asc" + echo " done." + echo "Verifying tarball..." + gpg --verify mwparserfromhell.tar.gz.asc + if [[ "$?" != "0" ]]; then + echo "*** ERROR: GPG signature verification failed!" + deactivate + cd .. + rm -rf $virtdir + exit 1 + fi + tar -xf mwparserfromhell.tar.gz + rm mwparserfromhell.tar.gz mwparserfromhell.tar.gz.asc + cd mwparserfromhell-$VERSION + echo "Running unit tests..." + python setup.py -q test + if [[ "$?" != "0" ]]; then + echo "*** ERROR: Unit tests failed!" + deactivate + cd ../.. + rm -rf $virtdir + exit 1 + fi + echo -n "Everything looks good. Cleaning up..." + deactivate + cd ../.. + rm -rf $virtdir + echo " done." +} + +echo "Preparing mwparserfromhell v$VERSION..." +cd "$SCRIPT_DIR/.." + +check_git +update_version +update_changelog +update_docs_changelog +do_git_stuff +post_release +test_release + +echo "All done." 
+exit 0 diff --git a/setup.py b/setup.py index 5a45902..6dbe783 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,8 @@ with open("README.rst") as fp: long_docs = fp.read() tokenizer = Extension("mwparserfromhell.parser._tokenizer", - sources = ["mwparserfromhell/parser/tokenizer.c"]) + sources = ["mwparserfromhell/parser/tokenizer.c"], + depends = ["mwparserfromhell/parser/tokenizer.h"]) setup( name = "mwparserfromhell", From d342831af8fd976bee6793fde6c3a781bc9fbb46 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 May 2014 21:53:33 -0400 Subject: [PATCH 004/102] Allow passing skip_style_tags to parse() (fixes #73) --- mwparserfromhell/parser/__init__.py | 2 +- mwparserfromhell/utils.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 093e501..67f6eeb 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -57,7 +57,7 @@ class Parser(object): """Parse *text*, returning a :py:class:`~.Wikicode` object tree. If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be - parsed, but instead be treated as plain text. + parsed, but instead will be treated as plain text. """ tokens = self._tokenizer.tokenize(text, context, skip_style_tags) code = self._builder.build(tokens) diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index 486170d..c6fd627 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -33,7 +33,7 @@ from .smart_list import SmartList __all__ = ["parse_anything"] -def parse_anything(value, context=0): +def parse_anything(value, context=0, skip_style_tags=False): """Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. This differs from :py:meth:`.Parser.parse` in that we accept more than just @@ -50,6 +50,9 @@ def parse_anything(value, context=0): For example, :py:class:`~.ExternalLink`\ 's :py:attr:`~.ExternalLink.url` setter sets *context* to :py:mod:`contexts.EXT_LINK_URI <.contexts>` to prevent the URL itself from becoming an :py:class:`~.ExternalLink`. + + If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be + parsed, but instead will be treated as plain text. 
""" from .parser import Parser from .wikicode import Wikicode @@ -59,17 +62,17 @@ def parse_anything(value, context=0): elif isinstance(value, Node): return Wikicode(SmartList([value])) elif isinstance(value, str): - return Parser().parse(value, context) + return Parser().parse(value, context, skip_style_tags) elif isinstance(value, bytes): - return Parser().parse(value.decode("utf8"), context) + return Parser().parse(value.decode("utf8"), context, skip_style_tags) elif isinstance(value, int): - return Parser().parse(str(value), context) + return Parser().parse(str(value), context, skip_style_tags) elif value is None: return Wikicode(SmartList()) try: nodelist = SmartList() for item in value: - nodelist += parse_anything(item, context).nodes + nodelist += parse_anything(item, context, skip_style_tags).nodes except TypeError: error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" raise ValueError(error.format(type(value).__name__, value)) From 9108d49d68e6024c751e723fcf8e118cdee77af0 Mon Sep 17 00:00:00 2001 From: Ricordisamoa Date: Sat, 24 May 2014 03:08:06 +0200 Subject: [PATCH 005/102] =?UTF-8?q?fix=20some=20typos:=20occurance=20?= =?UTF-8?q?=E2=86=92=20occurrence,=20parasable=20=E2=86=92=20parsable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mwparserfromhell/nodes/tag.py | 2 +- mwparserfromhell/nodes/template.py | 6 +++--- mwparserfromhell/wikicode.py | 10 +++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 661304e..f283d46 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -240,7 +240,7 @@ class Tag(Node): pad_before_eq="", pad_after_eq=""): """Add an attribute with the given *name* and *value*. - *name* and *value* can be anything parasable by + *name* and *value* can be anything parsable by :py:func:`.utils.parse_anything`; *value* can be omitted if the attribute is valueless. *quoted* is a bool telling whether to wrap the *value* in double quotes (this is recommended). *pad_first*, diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index d1a0b0e..3b5b35c 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -95,7 +95,7 @@ class Template(Node): def _select_theory(self, theories): """Return the most likely spacing convention given different options. - Given a dictionary of convention options as keys and their occurance as + Given a dictionary of convention options as keys and their occurrence as values, return the convention that occurs the most, or ``None`` if there is no clear preferred style. """ @@ -208,7 +208,7 @@ class Template(Node): preserve_spacing=True): """Add a parameter to the template with a given *name* and *value*. - *name* and *value* can be anything parasable by + *name* and *value* can be anything parsable by :py:func:`.utils.parse_anything`; pipes and equal signs are automatically escaped from *value* when appropriate. @@ -226,7 +226,7 @@ class Template(Node): name), then we will place the parameter immediately before this one. Otherwise, it will be added at the end. If *before* is a name and exists multiple times in the template, we will place it before the last - occurance. If *before* is not in the template, :py:exc:`ValueError` is + occurrence. If *before* is not in the template, :py:exc:`ValueError` is raised. The argument is ignored if the new parameter already exists. 
If *preserve_spacing* is ``False``, we will avoid preserving spacing diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index 44515a6..f728248 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -294,7 +294,7 @@ class Wikicode(StringMixIn): def insert(self, index, value): """Insert *value* at *index* in the list of nodes. - *value* can be anything parasable by :py:func:`.parse_anything`, which + *value* can be anything parsable by :py:func:`.parse_anything`, which includes strings or other :py:class:`~.Wikicode` or :py:class:`~.Node` objects. """ @@ -309,7 +309,7 @@ class Wikicode(StringMixIn): :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`, for example). If *obj* is a string, we will operate on all instances of that string within the code, otherwise only on the specific instance - given. *value* can be anything parasable by :py:func:`.parse_anything`. + given. *value* can be anything parsable by :py:func:`.parse_anything`. If *recursive* is ``True``, we will try to find *obj* within our child nodes even if it is not a direct descendant of this :py:class:`~.Wikicode` object. If *obj* is not found, @@ -333,7 +333,7 @@ class Wikicode(StringMixIn): :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`, for example). If *obj* is a string, we will operate on all instances of that string within the code, otherwise only on the specific instance - given. *value* can be anything parasable by :py:func:`.parse_anything`. + given. *value* can be anything parsable by :py:func:`.parse_anything`. If *recursive* is ``True``, we will try to find *obj* within our child nodes even if it is not a direct descendant of this :py:class:`~.Wikicode` object. If *obj* is not found, @@ -357,7 +357,7 @@ class Wikicode(StringMixIn): :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`, for example). If *obj* is a string, we will operate on all instances of that string within the code, otherwise only on the specific instance - given. *value* can be anything parasable by :py:func:`.parse_anything`. + given. *value* can be anything parsable by :py:func:`.parse_anything`. If *recursive* is ``True``, we will try to find *obj* within our child nodes even if it is not a direct descendant of this :py:class:`~.Wikicode` object. If *obj* is not found, @@ -380,7 +380,7 @@ class Wikicode(StringMixIn): def append(self, value): """Insert *value* at the end of the list of nodes. - *value* can be anything parasable by :py:func:`.parse_anything`. + *value* can be anything parsable by :py:func:`.parse_anything`. 
""" nodes = parse_anything(value).nodes for node in nodes: From 0497b54f03072effb42ac81dd9e1480042c03c76 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 May 2014 21:13:26 -0400 Subject: [PATCH 006/102] Fix _handle_single_tag_end()'s token search order (fixes #74) --- mwparserfromhell/parser/tokenizer.c | 2 +- mwparserfromhell/parser/tokenizer.py | 8 +++++--- tests/tokenizer/tags.mwtest | 7 +++++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index de58e72..d8a505f 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1899,7 +1899,7 @@ static PyObject* Tokenizer_handle_single_tag_end(Tokenizer* self) int is_instance; len = PyList_GET_SIZE(self->topstack->stack); - for (index = 0; index < len; index++) { + for (index = len - 1; index >= 0; index--) { token = PyList_GET_ITEM(self->topstack->stack, index); is_instance = PyObject_IsInstance(token, TagCloseOpen); if (is_instance == -1) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 29a7e25..93d53e7 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -21,6 +21,7 @@ # SOFTWARE. from __future__ import unicode_literals +from itertools import izip from math import log import re @@ -751,11 +752,12 @@ class Tokenizer(object): def _handle_single_tag_end(self): """Handle the stream end when inside a single-supporting HTML tag.""" - gen = enumerate(self._stack) + stack = self._stack + gen = izip(xrange(len(stack) - 1, -1, -1), reversed(stack)) index = next(i for i, t in gen if isinstance(t, tokens.TagCloseOpen)) - padding = self._stack[index].padding + padding = stack[index].padding token = tokens.TagCloseSelfclose(padding=padding, implicit=True) - self._stack[index] = token + stack[index] = token return self._pop() def _really_parse_tag(self): diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index a8ca2f0..26e569b 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -124,6 +124,13 @@ output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before --- +name: nested_tag_selfclosing +label: a tag nested within the attributes of another; outer tag implicitly self-closing +input: "
<li <b></b>test" +output: [TagOpenOpen(), Text(text="li"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TagOpenOpen(), Text(text="b"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="test"), TagCloseSelfclose(padding="", implicit=True)] From 0101c038fbc6e9048256d165e488ddf9218a4660 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 May 2014 21:21:47 -0400 Subject: [PATCH 007/102] Python 3, grr. --- mwparserfromhell/compat.py | 2 ++ mwparserfromhell/parser/tokenizer.py | 5 ++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mwparserfromhell/compat.py b/mwparserfromhell/compat.py index 4384ace..94e0db3 100644 --- a/mwparserfromhell/compat.py +++ b/mwparserfromhell/compat.py @@ -20,6 +20,7 @@ if py3k: range = range maxsize = sys.maxsize import html.entities as htmlentities + zip = zip else: bytes = str @@ -27,5 +28,6 @@ else: range = xrange maxsize = sys.maxint import htmlentitydefs as htmlentities + from itertools import izip as zip del sys diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 93d53e7..33722fa 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -21,12 +21,11 @@ # SOFTWARE. from __future__ import unicode_literals -from itertools import izip from math import log import re from . import contexts, tokens -from ..compat import htmlentities, range +from ..compat import htmlentities, range, zip from ..definitions import (get_html_tag, is_parsable, is_single, is_single_only, is_scheme) @@ -753,7 +752,7 @@ class Tokenizer(object): def _handle_single_tag_end(self): """Handle the stream end when inside a single-supporting HTML tag.""" stack = self._stack - gen = izip(xrange(len(stack) - 1, -1, -1), reversed(stack)) + gen = zip(range(len(stack) - 1, -1, -1), reversed(stack)) index = next(i for i, t in gen if isinstance(t, tokens.TagCloseOpen)) padding = stack[index].padding token = tokens.TagCloseSelfclose(padding=padding, implicit=True) - self._stack[index] = token + stack[index] = token return self._pop() From b4b62026f810b4eacfd13c5e503d757b96bcdb8a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 25 May 2014 15:50:12 -0400 Subject: [PATCH 008/102] Update changelog, docs. --- CHANGELOG | 4 ++++ docs/changelog.rst | 4 ++++ mwparserfromhell/parser/__init__.py | 7 +++++++ mwparserfromhell/utils.py | 9 +-------- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 98a1f96..289c413 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,10 @@ v0.4 (unreleased): - Added a script to test for memory leaks in scripts/memtest.py. - Added a script to do releases in scripts/release.sh. +- skip_style_tags can now be passed to mwparserfromhell.parse() (previously, + only Parser().parse() allowed it). +- Fixed a parser bug involving nested tags. +- Updated and fixed some documentation. v0.3.3 (released April 22, 2014): diff --git a/docs/changelog.rst b/docs/changelog.rst index 3ce507e..21f0629 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -9,6 +9,10 @@ Unreleased - Added a script to test for memory leaks in :file:`scripts/memtest.py`. - Added a script to do releases in :file:`scripts/release.sh`. +- *skip_style_tags* can now be passed to :py:func:`mwparserfromhell.parse() <.parse_anything>` (previously, only :py:meth:`.Parser.parse` allowed it). +- Fixed a parser bug involving nested tags. +- Updated and fixed some documentation.
v0.3.3 ------ diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 67f6eeb..8bac295 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -56,6 +56,13 @@ class Parser(object): def parse(self, text, context=0, skip_style_tags=False): """Parse *text*, returning a :py:class:`~.Wikicode` object tree. + If given, *context* will be passed as a starting context to the parser. + This is helpful when this function is used inside node attribute + setters. For example, :py:class:`~.ExternalLink`\ 's + :py:attr:`~.ExternalLink.url` setter sets *context* to + :py:mod:`contexts.EXT_LINK_URI <.contexts>` to prevent the URL itself + from becoming an :py:class:`~.ExternalLink`. + If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be parsed, but instead will be treated as plain text. """ diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index c6fd627..fd54ad0 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -45,14 +45,7 @@ def parse_anything(value, context=0, skip_style_tags=False): :py:class:`~.Template`, such as :py:meth:`wikicode.insert() <.Wikicode.insert>` or setting :py:meth:`template.name <.Template.name>`. - If given, *context* will be passed as a starting context to the parser. - This is helpful when this function is used inside node attribute setters. - For example, :py:class:`~.ExternalLink`\ 's :py:attr:`~.ExternalLink.url` - setter sets *context* to :py:mod:`contexts.EXT_LINK_URI <.contexts>` to - prevent the URL itself from becoming an :py:class:`~.ExternalLink`. - - If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be - parsed, but instead will be treated as plain text. + Additional arguments are passed directly to :py:meth:`.Parser.parse`. """ from .parser import Parser from .wikicode import Wikicode From c95802f9cc124cdd8e5b87a733a673dcaf20c2da Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 25 May 2014 22:27:31 -0400 Subject: [PATCH 009/102] Allow recursing through everything except the forced type (fixes #70) --- CHANGELOG | 4 ++++ docs/changelog.rst | 5 ++++ mwparserfromhell/wikicode.py | 56 +++++++++++++++++++++++++++----------------- tests/test_wikicode.py | 21 ++++++++++++----- 4 files changed, 59 insertions(+), 27 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 289c413..7da4968 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,10 @@ v0.4 (unreleased): - Added a script to do releases in scripts/release.sh. - skip_style_tags can now be passed to mwparserfromhell.parse() (previously, only Parser().parse() allowed it). +- The 'recursive' argument to Wikicode's filter methods now accepts a third + option, RECURSE_OTHERS, which recurses over all children except instances of + 'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)` + returns all un-nested templates). - Fixed a parser bug involving nested tags. - Updated and fixed some documentation. diff --git a/docs/changelog.rst b/docs/changelog.rst index 21f0629..8416204 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -11,6 +11,11 @@ Unreleased - Added a script to do releases in :file:`scripts/release.sh`. - *skip_style_tags* can now be passed to :py:func:`mwparserfromhell.parse() <.parse_anything>` (previously, only :py:meth:`.Parser.parse` allowed it). 
+- The *recursive* argument to :py:class:`Wikicode's <.Wikicode>` + :py:meth:`.filter` methods now accepts a third option, ``RECURSE_OTHERS``, + which recurses over all children except instances of *forcetype* (for + example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all un-nested + templates). - Fixed a parser bug involving nested tags. - Updated and fixed some documentation. diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index f728248..d7736ff 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -44,6 +44,7 @@ class Wikicode(StringMixIn): ` series of functions is very useful for extracting and iterating over, for example, all of the templates in the object. """ + RECURSE_OTHERS = 2 def __init__(self, nodes): super(Wikicode, self).__init__() @@ -53,12 +54,15 @@ class Wikicode(StringMixIn): return "".join([str(node) for node in self.nodes]) @staticmethod - def _get_children(node, contexts=False, parent=None): + def _get_children(node, contexts=False, restrict=None, parent=None): """Iterate over all child :py:class:`.Node`\ s of a given *node*.""" yield (parent, node) if contexts else node + if restrict and isinstance(node, restrict): + return for code in node.__children__(): for child in code.nodes: - for result in Wikicode._get_children(child, contexts, code): + sub = Wikicode._get_children(child, contexts, restrict, code) + for result in sub: yield result @staticmethod @@ -79,7 +83,7 @@ class Wikicode(StringMixIn): if matches: if callable(matches): return matches - return lambda obj: re.search(matches, str(obj), flags) # r + return lambda obj: re.search(matches, str(obj), flags) return lambda obj: True def _indexed_ifilter(self, recursive=True, matches=None, flags=FLAGS, @@ -93,8 +97,9 @@ class Wikicode(StringMixIn): """ match = self._build_matcher(matches, flags) if recursive: + restrict = forcetype if recursive == self.RECURSE_OTHERS else None def getter(i, node): - for ch in self._get_children(node): + for ch in self._get_children(node, restrict=restrict): yield (i, ch) inodes = chain(*(getter(i, n) for i, n in enumerate(self.nodes))) else: @@ -222,10 +227,10 @@ class Wikicode(StringMixIn): This is equivalent to :py:meth:`{1}` with *forcetype* set to :py:class:`~{2.__module__}.{2.__name__}`. """ - make_ifilter = lambda ftype: (lambda self, **kw: - self.ifilter(forcetype=ftype, **kw)) - make_filter = lambda ftype: (lambda self, **kw: - self.filter(forcetype=ftype, **kw)) + make_ifilter = lambda ftype: (lambda self, *a, **kw: + self.ifilter(forcetype=ftype, *a, **kw)) + make_filter = lambda ftype: (lambda self, *a, **kw: + self.filter(forcetype=ftype, *a, **kw)) for name, ftype in (meths.items() if py3k else meths.iteritems()): ifilter = make_ifilter(ftype) filter = make_filter(ftype) @@ -435,27 +440,36 @@ class Wikicode(StringMixIn): forcetype=None): """Iterate over nodes in our list matching certain conditions. - If *recursive* is ``True``, we will iterate over our children and all - of their descendants, otherwise just our immediate children. If - *forcetype* is given, only nodes that are instances of this type are - yielded. *matches* can be used to further restrict the nodes, either as - a function (taking a single :py:class:`.Node` and returning a boolean) - or a regular expression (matched against the node's string - representation with :py:func:`re.search`). 
If *matches* is a regex, the - flags passed to :py:func:`re.search` are :py:const:`re.IGNORECASE`, + If *forcetype* is given, only nodes that are instances of this type (or + tuple of types) are yielded. Setting *recursive* to ``True`` will + iterate over all children and their descendants. ``RECURSE_OTHERS`` + will only iterate over children that are not the instances of + *forcetype*. ``False`` will only iterate over immediate children. + + ``RECURSE_OTHERS`` can be used to iterate over all un-nested templates, + even if they are inside of HTML tags, like so: + + >>> code = mwparserfromhell.parse("{{foo}}{{foo|{{bar}}}}") + >>> code.filter_templates(code.RECURSE_OTHERS) + ["{{foo}}", "{{foo|{{bar}}}}"] + + *matches* can be used to further restrict the nodes, either as a + function (taking a single :py:class:`.Node` and returning a boolean) or + a regular expression (matched against the node's string representation + with :py:func:`re.search`). If *matches* is a regex, the flags passed + to :py:func:`re.search` are :py:const:`re.IGNORECASE`, :py:const:`re.DOTALL`, and :py:const:`re.UNICODE`, but custom flags can be specified by passing *flags*. """ - return (node for i, node in - self._indexed_ifilter(recursive, matches, flags, forcetype)) + gen = self._indexed_ifilter(recursive, matches, flags, forcetype) + return (node for i, node in gen) - def filter(self, recursive=True, matches=None, flags=FLAGS, - forcetype=None): + def filter(self, *args, **kwargs): """Return a list of nodes within our list matching certain conditions. This is equivalent to calling :py:func:`list` on :py:meth:`ifilter`. """ - return list(self.ifilter(recursive, matches, flags, forcetype)) + return list(self.ifilter(*args, **kwargs)) def get_sections(self, levels=None, matches=None, flags=FLAGS, flat=False, include_lead=None, include_headings=True): diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py index 9ff5949..a7c3eb3 100644 --- a/tests/test_wikicode.py +++ b/tests/test_wikicode.py @@ -319,11 +319,14 @@ class TestWikicode(TreeEqualityTestCase): self.assertEqual(["{{baz}}", "{{bz}}"], func(matches=r"^{{b.*?z")) self.assertEqual(["{{baz}}"], func(matches=r"^{{b.+?z}}")) - self.assertEqual(["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}"], - code2.filter_templates(recursive=False)) - self.assertEqual(["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}", "{{b}}", - "{{c|d={{f}}{{h}}}}", "{{f}}", "{{h}}"], - code2.filter_templates(recursive=True)) + exp_rec = ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}", "{{b}}", + "{{c|d={{f}}{{h}}}}", "{{f}}", "{{h}}"] + exp_unrec = ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}"] + self.assertEqual(exp_rec, code2.filter_templates()) + self.assertEqual(exp_unrec, code2.filter_templates(recursive=False)) + self.assertEqual(exp_rec, code2.filter_templates(recursive=True)) + self.assertEqual(exp_rec, code2.filter_templates(True)) + self.assertEqual(exp_unrec, code2.filter_templates(False)) self.assertEqual(["{{foobar}}"], code3.filter_templates( matches=lambda node: node.name.matches("Foobar"))) @@ -332,9 +335,15 @@ class TestWikicode(TreeEqualityTestCase): self.assertEqual([], code3.filter_tags(matches=r"^{{b.*?z")) self.assertEqual([], code3.filter_tags(matches=r"^{{b.*?z", flags=0)) - self.assertRaises(TypeError, code.filter_templates, 100) self.assertRaises(TypeError, code.filter_templates, a=42) self.assertRaises(TypeError, code.filter_templates, forcetype=Template) + self.assertRaises(TypeError, code.filter_templates, 1, 0, 0, Template) + + code4 = parse("{{foo}}{{foo|{{bar}}}}") + actual1 = 
code4.filter_templates(recursive=code4.RECURSE_OTHERS) + actual2 = code4.filter_templates(code4.RECURSE_OTHERS) + self.assertEqual(["{{foo}}", "{{foo|{{bar}}}}"], actual1) + self.assertEqual(["{{foo}}", "{{foo|{{bar}}}}"], actual2) def test_get_sections(self): """test Wikicode.get_sections()""" From 51df09ccf0b0b6c0c0cb6d47e64f3937437f8bc5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 31 May 2014 21:02:16 -0400 Subject: [PATCH 010/102] Really minor documentation fixes. --- README.rst | 6 ++++-- docs/integration.rst | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 5b4cfe1..1b25c42 100644 --- a/README.rst +++ b/README.rst @@ -123,19 +123,21 @@ If you're using Pywikipedia_, your code might look like this:: import mwparserfromhell import wikipedia as pywikibot + def parse(title): site = pywikibot.getSite() page = pywikibot.Page(site, title) text = page.get() return mwparserfromhell.parse(text) -If you're not using a library, you can parse templates in any page using the -following code (via the API_):: +If you're not using a library, you can parse any page using the following code +(via the API_):: import json import urllib import mwparserfromhell API_URL = "http://en.wikipedia.org/w/api.php" + def parse(title): data = {"action": "query", "prop": "revisions", "rvlimit": 1, "rvprop": "content", "format": "json", "titles": title} diff --git a/docs/integration.rst b/docs/integration.rst index 78810b8..a09334d 100644 --- a/docs/integration.rst +++ b/docs/integration.rst @@ -11,19 +11,21 @@ If you're using Pywikipedia_, your code might look like this:: import mwparserfromhell import wikipedia as pywikibot + def parse(title): site = pywikibot.getSite() page = pywikibot.Page(site, title) text = page.get() return mwparserfromhell.parse(text) -If you're not using a library, you can parse templates in any page using the -following code (via the API_):: +If you're not using a library, you can parse any page using the following code +(via the API_):: import json import urllib import mwparserfromhell API_URL = "http://en.wikipedia.org/w/api.php" + def parse(title): raw = urllib.urlopen(API_URL, data).read() res = json.loads(raw) From 34a6c7cc4c8ee6bc9c2b5628a2e81ec2f971884c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 15 Jun 2014 22:20:28 -0400 Subject: [PATCH 011/102] Typo fix. --- mwparserfromhell/parser/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 33722fa..aa7499a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -112,7 +112,7 @@ class Tokenizer(object): self._textbuffer = [] def _pop(self, keep_context=False): - """Pop the current stack/context/textbuffer, returing the stack. + """Pop the current stack/context/textbuffer, returning the stack. If *keep_context* is ``True``, then we will replace the underlying stack's context with the current stack's. From 02eff0fc490d6f46309a96d24e338f4ee69b8381 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 23 Jun 2014 23:32:47 -0400 Subject: [PATCH 012/102] Fully fix #74. Add another tokenizer test. 
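The earlier attempt (9108d49) searched the token stack in reverse for a TagCloseOpen, but when another tag is nested inside the attributes of the unclosed tag, the token found that way can belong to the inner tag rather than the outer one. Track nesting depth instead, so we find the TagCloseOpen that matches the outer TagOpenOpen at the start of the stack. In outline (this mirrors the Python tokenizer change below; the C tokenizer gets the same treatment):

    depth = 1
    for index, token in enumerate(stack[2:], 2):
        if isinstance(token, tokens.TagOpenOpen):
            depth += 1
        elif isinstance(token, tokens.TagCloseOpen):
            depth -= 1
            if depth == 0:
                break
    padding = stack[index].padding
    stack[index] = tokens.TagCloseSelfclose(padding=padding, implicit=True)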
--- mwparserfromhell/compat.py | 2 -- mwparserfromhell/parser/tokenizer.c | 18 +++++++++++++----- mwparserfromhell/parser/tokenizer.py | 17 ++++++++++++----- tests/tokenizer/integration.mwtest | 7 +++++++ 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/mwparserfromhell/compat.py b/mwparserfromhell/compat.py index 94e0db3..4384ace 100644 --- a/mwparserfromhell/compat.py +++ b/mwparserfromhell/compat.py @@ -20,7 +20,6 @@ if py3k: range = range maxsize = sys.maxsize import html.entities as htmlentities - zip = zip else: bytes = str @@ -28,6 +27,5 @@ else: range = xrange maxsize = sys.maxint import htmlentitydefs as htmlentities - from itertools import izip as zip del sys diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index d8a505f..41ce5ac 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1896,18 +1896,26 @@ static PyObject* Tokenizer_handle_single_tag_end(Tokenizer* self) { PyObject *token = 0, *padding, *kwargs; Py_ssize_t len, index; - int is_instance; + int depth = 1, is_instance; len = PyList_GET_SIZE(self->topstack->stack); - for (index = len - 1; index >= 0; index--) { + for (index = 2; index < len; index++) { token = PyList_GET_ITEM(self->topstack->stack, index); - is_instance = PyObject_IsInstance(token, TagCloseOpen); + is_instance = PyObject_IsInstance(token, TagOpenOpen); if (is_instance == -1) return NULL; else if (is_instance == 1) - break; + depth++; + is_instance = PyObject_IsInstance(token, TagCloseOpen); + if (is_instance == -1) + return NULL; + else if (is_instance == 1) { + depth--; + if (depth == 0) + break; + } } - if (!token) + if (!token || depth > 0) return NULL; padding = PyObject_GetAttrString(token, "padding"); if (!padding) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index aa7499a..e69a823 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -25,7 +25,7 @@ from math import log import re from . 
import contexts, tokens -from ..compat import htmlentities, range, zip +from ..compat import htmlentities, range from ..definitions import (get_html_tag, is_parsable, is_single, is_single_only, is_scheme) @@ -752,11 +752,18 @@ class Tokenizer(object): def _handle_single_tag_end(self): """Handle the stream end when inside a single-supporting HTML tag.""" stack = self._stack - gen = zip(range(len(stack) - 1, -1, -1), reversed(stack)) - index = next(i for i, t in gen if isinstance(t, tokens.TagCloseOpen)) + # We need to find the index of the TagCloseOpen token corresponding to + # the TagOpenOpen token located at index 0: + depth = 1 + for index, token in enumerate(stack[2:], 2): + if isinstance(token, tokens.TagOpenOpen): + depth += 1 + elif isinstance(token, tokens.TagCloseOpen): + depth -= 1 + if depth == 0: + break padding = stack[index].padding - token = tokens.TagCloseSelfclose(padding=padding, implicit=True) - stack[index] = token + stack[index] = tokens.TagCloseSelfclose(padding=padding, implicit=True) return self._pop() def _really_parse_tag(self): diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index bf19f4d..5e1a409 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -178,3 +178,10 @@ name: external_link_inside_wikilink_title label: an external link inside a wikilink title, which is invalid input: "[[File:Example.png http://example.com]]" output: [WikilinkOpen(), Text(text="File:Example.png http://example.com"), WikilinkClose()] + +--- + +name: italics_inside_external_link_inside_incomplete_list +label: italic text inside an external link inside an incomplete list +input: "
<li>[http://www.example.com ''example'']" +output: [TagOpenOpen(), Text(text="li"), TagCloseSelfclose(padding="", implicit=True), ExternalLinkOpen(brackets=True), Text(text="http://www.example.com"), ExternalLinkSeparator(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="example"), TagOpenClose(), Text(text="i"), TagCloseClose(), ExternalLinkClose()] From 9412579d862451e2b8d14f0010f16df7ecce61f5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 27 Jun 2014 15:17:48 -0400 Subject: [PATCH 013/102] Remove unnecessary unicode_literals. --- mwparserfromhell/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index e7459e3..9c29fd2 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -26,8 +26,6 @@ Parser from Hell) is a Python package that provides an easy-to-use and outrageously powerful parser for `MediaWiki <http://mediawiki.org/>`_ wikicode. """ -from __future__ import unicode_literals - __author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2012, 2013, 2014 Ben Kurtovic" __license__ = "MIT License" From 3dd29097e4d436f4fa7a01e2c4213c528168b242 Mon Sep 17 00:00:00 2001 From: Merlijn van Deen Date: Fri, 27 Jun 2014 15:21:16 -0700 Subject: [PATCH 014/102] _test_tokenizer: force utf-8 file encoding --- tests/_test_tokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py index 7487241..313b959 100644 --- a/tests/_test_tokenizer.py +++ b/tests/_test_tokenizer.py @@ -25,6 +25,8 @@ from os import listdir, path import sys from mwparserfromhell.compat import py3k +if not py3k: + from codecs import open from mwparserfromhell.parser import tokens class _TestParseError(Exception): @@ -109,10 +111,8 @@ class TokenizerTestCase(object): def build(cls): """Load and install all tests from the 'tokenizer' directory.""" def load_file(filename): - with open(filename, "rU") as fp: + with open(filename, "rU", encoding='utf8') as fp: text = fp.read() - if not py3k: - text = text.decode("utf8") name = path.split(filename)[1][:0-len(extension)] cls._load_tests(filename, name, text) From b135e8e473837909c6847f8a52711527409b5224 Mon Sep 17 00:00:00 2001 From: Merlijn van Deen Date: Fri, 27 Jun 2014 15:21:37 -0700 Subject: [PATCH 015/102] Add windows build tools --- tools/build_mwpfh.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 tools/build_mwpfh.py diff --git a/tools/build_mwpfh.py b/tools/build_mwpfh.py new file mode 100644 index 0000000..a090b4d --- /dev/null +++ b/tools/build_mwpfh.py @@ -0,0 +1,43 @@ +from __future__ import print_function + +import subprocess +import sys +import os + +path = os.path.split(__file__)[0] +if path: + os.chdir(path) + +environments = ['26', '27', '32', '33', '34'] + +target = "pypi" if "--push" in sys.argv else "test" + +returnvalues = {} + +def run(pyver, cmds, target=None): + cmd = [r"C:\Python%s\Python.exe" % pyver, "setup.py"] + cmds + if target: + cmd += ["-r", target] + + print(" ".join(cmd), end=" ") + retval = subprocess.call(cmd, stdout=open("%s%s.log" % (cmds[0], pyver), 'w'), stderr=subprocess.STDOUT, cwd="..") + if not retval: + print("[OK]") + else: + print("[FAILED (%i)]" % retval) + return retval + +run("27", ["register"], target) + +if 'failed' in open('register27.log').read(): + raise Exception + +for pyver in environments: + print() + try: + os.unlink('mwparserfromhell/parser/_tokenizer.pyd') + except WindowsError: + pass
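+    # After clearing the stale extension above, only upload a wheel for this
+    # interpreter if its test run passes: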
+ + if run(pyver, ["test"]) == 0: + run(pyver, ["bdist_wheel", "upload"], target) \ No newline at end of file From 05d048762f2e0c81a9e425425269dcdde4bec251 Mon Sep 17 00:00:00 2001 From: Merlijn van Deen Date: Fri, 27 Jun 2014 15:21:45 -0700 Subject: [PATCH 016/102] Improve .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 4068716..8790182 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc +*.pyd *.so *.dll *.egg @@ -8,3 +9,4 @@ __pycache__ build dist docs/_build +tools/*.log From 581ca9a2213d6329a45d3b927873febe9e5ad479 Mon Sep 17 00:00:00 2001 From: Merlijn van Deen Date: Sat, 28 Jun 2014 00:35:50 +0200 Subject: [PATCH 017/102] Update README.rst for the new Windows wheels --- README.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 5b4cfe1..d0f67c9 100644 --- a/README.rst +++ b/README.rst @@ -17,7 +17,10 @@ Installation The easiest way to install the parser is through the `Python Package Index`_, so you can install the latest release with ``pip install mwparserfromhell`` -(`get pip`_). Alternatively, get the latest development version:: +(`get pip`_). On Windows, make sure you have the latest version of pip +installed by running `pip install --upgrade pip`. + +Alternatively, get the latest development version:: git clone https://github.com/earwig/mwparserfromhell.git cd mwparserfromhell From 5e9930b8a060d2ad80713809e312eae913ce7a4f Mon Sep 17 00:00:00 2001 From: Merlijn van Deen Date: Sat, 28 Jun 2014 00:37:31 +0200 Subject: [PATCH 018/102] Fix tabs in update_mwpfh.py --- tools/build_mwpfh.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/tools/build_mwpfh.py b/tools/build_mwpfh.py index a090b4d..4a86241 100644 --- a/tools/build_mwpfh.py +++ b/tools/build_mwpfh.py @@ -6,7 +6,7 @@ import os path = os.path.split(__file__)[0] if path: - os.chdir(path) + os.chdir(path) environments = ['26', '27', '32', '33', '34'] @@ -15,29 +15,29 @@ target = "pypi" if "--push" in sys.argv else "test" returnvalues = {} def run(pyver, cmds, target=None): - cmd = [r"C:\Python%s\Python.exe" % pyver, "setup.py"] + cmds - if target: - cmd += ["-r", target] - - print(" ".join(cmd), end=" ") - retval = subprocess.call(cmd, stdout=open("%s%s.log" % (cmds[0], pyver), 'w'), stderr=subprocess.STDOUT, cwd="..") - if not retval: - print("[OK]") - else: - print("[FAILED (%i)]" % retval) - return retval +cmd = [r"C:\Python%s\Python.exe" % pyver, "setup.py"] + cmds +if target: + cmd += ["-r", target] + +print(" ".join(cmd), end=" ") +retval = subprocess.call(cmd, stdout=open("%s%s.log" % (cmds[0], pyver), 'w'), stderr=subprocess.STDOUT, cwd="..") +if not retval: + print("[OK]") +else: + print("[FAILED (%i)]" % retval) +return retval run("27", ["register"], target) if 'failed' in open('register27.log').read(): - raise Exception + raise Exception for pyver in environments: - print() - try: - os.unlink('mwparserfromhell/parser/_tokenizer.pyd') - except WindowsError: - pass - - if run(pyver, ["test"]) == 0: - run(pyver, ["bdist_wheel", "upload"], target) \ No newline at end of file + print() + try: + os.unlink('mwparserfromhell/parser/_tokenizer.pyd') + except WindowsError: + pass + + if run(pyver, ["test"]) == 0: + run(pyver, ["bdist_wheel", "upload"], target) From fb16781659080a7a38888e4579430192e66347cb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 27 Jun 2014 18:49:44 -0400 Subject: [PATCH 019/102] 3.4 should work on Travis now. 
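Travis CI now provides Python 3.4 workers, so 3.4 joins 2.6, 2.7, 3.2, and 3.3 in the build matrix below.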
--- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 31090f2..5fe3760 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,5 +4,6 @@ python: - "2.7" - "3.2" - "3.3" + - "3.4" install: python setup.py build script: python setup.py test -q From efcd59e0972e2957d70c23ac3d071d82bfa4d88d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 27 Jun 2014 18:51:22 -0400 Subject: [PATCH 020/102] Put -q earlier so Travis generates fewer messages. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 5fe3760..de041fa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,4 +6,4 @@ python: - "3.3" - "3.4" install: python setup.py build -script: python setup.py test -q +script: python setup.py -q test From 9b207dc7e232f4599f4d30a66fea2e4510d0f825 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 27 Jun 2014 18:52:24 -0400 Subject: [PATCH 021/102] Use the newer SVG. --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 1b25c42..755bcbd 100644 --- a/README.rst +++ b/README.rst @@ -1,7 +1,7 @@ mwparserfromhell ================ -.. image:: https://travis-ci.org/earwig/mwparserfromhell.png?branch=develop +.. image:: https://api.travis-ci.org/earwig/mwparserfromhell.svg?branch=develop :alt: Build Status :target: http://travis-ci.org/earwig/mwparserfromhell From d8adb62454f464f39b59c179f52ddb17621a2e18 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 27 Jun 2014 19:56:18 -0400 Subject: [PATCH 022/102] Some tweaks, but no change in functionality. --- .gitignore | 2 +- README.rst | 11 +++-------- scripts/win_build.py | 36 ++++++++++++++++++++++++++++++++++++ setup.py | 4 ++-- tests/_test_tokenizer.py | 5 ++--- tools/build_mwpfh.py | 43 ------------------------------------------- 6 files changed, 44 insertions(+), 57 deletions(-) create mode 100644 scripts/win_build.py delete mode 100644 tools/build_mwpfh.py diff --git a/.gitignore b/.gitignore index 8790182..f7f7bd9 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ __pycache__ build dist docs/_build -tools/*.log +scripts/*.log diff --git a/README.rst b/README.rst index d0f67c9..c112afd 100644 --- a/README.rst +++ b/README.rst @@ -15,10 +15,10 @@ Full documentation is available on ReadTheDocs_. Development occurs on GitHub_. Installation ------------ -The easiest way to install the parser is through the `Python Package Index`_, -so you can install the latest release with ``pip install mwparserfromhell`` +The easiest way to install the parser is through the `Python Package Index`_; +you can install the latest release with ``pip install mwparserfromhell`` (`get pip`_). On Windows, make sure you have the latest version of pip -installed by running `pip install --upgrade pip`. +installed by running ``pip install --upgrade pip``. Alternatively, get the latest development version:: @@ -26,11 +26,6 @@ Alternatively, get the latest development version:: cd mwparserfromhell python setup.py install -If you get ``error: Unable to find vcvarsall.bat`` while installing, this is -because Windows can't find the compiler for C extensions. Consult this -`StackOverflow question`_ for help. You can also set ``ext_modules`` in -``setup.py`` to an empty list to prevent the extension from building. - You can run the comprehensive unit testing suite with ``python setup.py test -q``. 
diff --git a/scripts/win_build.py b/scripts/win_build.py new file mode 100644 index 0000000..c70dedc --- /dev/null +++ b/scripts/win_build.py @@ -0,0 +1,36 @@ +from __future__ import print_function +import os +from subprocess import call, STDOUT + +ENVIRONMENTS = ["26", "27", "32", "33", "34"] + +def run(pyver, cmds): + cmd = [r"C:\Python%s\Python.exe" % pyver, "setup.py"] + cmds + print(" ".join(cmd), end=" ") + + with open("%s%s.log" % (cmds[0], pyver), "w") as logfile: + retval = call(cmd, stdout=logfile, stderr=STDOUT, cwd="..") + if not retval: + print("[OK]") + else: + print("[FAILED (%i)]" % retval) + return retval + +def main(): + path = os.path.split(__file__)[0] + if path: + os.chdir(path) + + print("Building Windows wheels for Python %s:" % ", ".join(ENVIRONMENTS)) + for pyver in ENVIRONMENTS: + print() + try: + os.unlink("mwparserfromhell/parser/_tokenizer.pyd") + except OSError: + pass + + if run(pyver, ["test"]) == 0: + run(pyver, ["bdist_wheel", "upload"]) + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 5a45902..07fb330 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ import sys if (sys.version_info[0] == 2 and sys.version_info[1] < 6) or \ (sys.version_info[1] == 3 and sys.version_info[1] < 2): - raise Exception('mwparserfromhell needs Python 2.6+ or 3.2+') + raise Exception("mwparserfromhell needs Python 2.6+ or 3.2+") from setuptools import setup, find_packages, Extension @@ -36,7 +36,7 @@ with open("README.rst") as fp: long_docs = fp.read() tokenizer = Extension("mwparserfromhell.parser._tokenizer", - sources = ["mwparserfromhell/parser/tokenizer.c"]) + sources=["mwparserfromhell/parser/tokenizer.c"]) setup( name = "mwparserfromhell", diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py index 313b959..bfd4857 100644 --- a/tests/_test_tokenizer.py +++ b/tests/_test_tokenizer.py @@ -21,12 +21,11 @@ # SOFTWARE. 
from __future__ import print_function, unicode_literals +import codecs from os import listdir, path import sys from mwparserfromhell.compat import py3k -if not py3k: - from codecs import open from mwparserfromhell.parser import tokens class _TestParseError(Exception): @@ -111,7 +110,7 @@ class TokenizerTestCase(object): def build(cls): """Load and install all tests from the 'tokenizer' directory.""" def load_file(filename): - with open(filename, "rU", encoding='utf8') as fp: + with codecs.open(filename, "rU", encoding="utf8") as fp: text = fp.read() name = path.split(filename)[1][:0-len(extension)] cls._load_tests(filename, name, text) diff --git a/tools/build_mwpfh.py b/tools/build_mwpfh.py deleted file mode 100644 index 4a86241..0000000 --- a/tools/build_mwpfh.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import print_function - -import subprocess -import sys -import os - -path = os.path.split(__file__)[0] -if path: - os.chdir(path) - -environments = ['26', '27', '32', '33', '34'] - -target = "pypi" if "--push" in sys.argv else "test" - -returnvalues = {} - -def run(pyver, cmds, target=None): -cmd = [r"C:\Python%s\Python.exe" % pyver, "setup.py"] + cmds -if target: - cmd += ["-r", target] - -print(" ".join(cmd), end=" ") -retval = subprocess.call(cmd, stdout=open("%s%s.log" % (cmds[0], pyver), 'w'), stderr=subprocess.STDOUT, cwd="..") -if not retval: - print("[OK]") -else: - print("[FAILED (%i)]" % retval) -return retval - -run("27", ["register"], target) - -if 'failed' in open('register27.log').read(): - raise Exception - -for pyver in environments: - print() - try: - os.unlink('mwparserfromhell/parser/_tokenizer.pyd') - except WindowsError: - pass - - if run(pyver, ["test"]) == 0: - run(pyver, ["bdist_wheel", "upload"], target) From 38e423b1407c97aec3b1495902a6ab9ef517e17b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 27 Jun 2014 20:40:36 -0400 Subject: [PATCH 023/102] Update release script. --- scripts/release.sh | 14 ++++++++++++-- scripts/win_build.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/scripts/release.sh b/scripts/release.sh index c10871d..4becf1a 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -66,13 +66,21 @@ do_git_stuff() { echo " done." } -build_sdist() { - echo -n "Uploading to PyPI..." +upload_to_pypi() { + # TODO: check whether these commands give output + echo -n "PyPI: uploading source tarball and docs..." python setup.py register sdist upload -s python setup.py upload_docs echo " done." } +windows_build() { + echo "PyPI: building/uploading Windows binaries..." + echo "*** Run in Windows: ./scripts/win_build.py" + echo "*** Press enter when done." + read +} + post_release() { echo echo "*** Release completed." 
@@ -148,6 +156,8 @@ update_version update_changelog update_docs_changelog do_git_stuff +upload_to_pypi +windows_build post_release test_release diff --git a/scripts/win_build.py b/scripts/win_build.py index c70dedc..143f060 100644 --- a/scripts/win_build.py +++ b/scripts/win_build.py @@ -30,7 +30,7 @@ def main(): pass if run(pyver, ["test"]) == 0: - run(pyver, ["bdist_wheel", "upload"]) + run(pyver, ["bdist_wheel", "upload"]) # TODO: add "-s" to GPG sign if __name__ == "__main__": main() From ad03f60140e011dc1f47d3813693e6f4cea604d8 Mon Sep 17 00:00:00 2001 From: Merlijn van Deen Date: Sat, 28 Jun 2014 11:11:11 +0200 Subject: [PATCH 024/102] Add build requirements --- scripts/win_build.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/win_build.py b/scripts/win_build.py index 143f060..2d51909 100644 --- a/scripts/win_build.py +++ b/scripts/win_build.py @@ -1,3 +1,25 @@ +# Build requirements: +# +# Python 2.6-3.2: Visual C++ Express Edition 2008: +# http://go.microsoft.com/?linkid=7729279 +# +# Python 3.3+: Visual C++ Express Edition 2010: +# http://go.microsoft.com/?linkid=9709949 +# +# x64 builds: Microsoft Windows SDK for Windows 7 and .NET Framework 3.5 SP1: +# http://www.microsoft.com/en-us/download/details.aspx?id=3138 +# +# Python interpreter, 2.6, 2.7, 3.2-3.4: +# https://www.python.org/downloads/ +# +# Pip, setuptools, wheel: +# https://bootstrap.pypa.io/get-pip.py +# and run *for each* Python version: +# c:\pythonXX\python get-pip.py +# c:\pythonXX\scripts\pip install wheel +# +# Afterwards, run this script with any of the python interpreters (2.7 suggested) + from __future__ import print_function import os from subprocess import call, STDOUT From 25e7e7da700d926fdbfc55e4e44dafa29f330dc6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 3 Jul 2014 16:59:49 -0400 Subject: [PATCH 025/102] Test coveralls. --- .travis.yml | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index de041fa..c8dbb88 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,14 @@ language: python python: - - "2.6" - - "2.7" - - "3.2" - - "3.3" - - "3.4" -install: python setup.py build -script: python setup.py -q test + - 2.6 + - 2.7 + - 3.2 + - 3.3 + - 3.4 +install: + - pip install coveralls + - python setup.py build +script: + - coverage run --source=mwparserfromhell setup.py -q test +after_success: + - coveralls From 50515f3e7729b23c0f27fcb13b2d50244b93f4bd Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 3 Jul 2014 17:21:00 -0400 Subject: [PATCH 026/102] README badge thing. --- README.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 572afba..b6bf7e7 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,14 @@ mwparserfromhell ================ -.. image:: https://api.travis-ci.org/earwig/mwparserfromhell.svg?branch=develop +.. image:: https://img.shields.io/travis/earwig/mwparserfromhell/develop.svg :alt: Build Status :target: http://travis-ci.org/earwig/mwparserfromhell +.. image:: https://img.shields.io/coveralls/earwig/mwparserfromhell/develop.svg + :alt: Coverage Status + :target: https://coveralls.io/r/earwig/mwparserfromhell + **mwparserfromhell** (the *MediaWiki Parser from Hell*) is a Python package that provides an easy-to-use and outrageously powerful parser for MediaWiki_ wikicode. It supports Python 2 and Python 3. 
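Before the next patch reworks error handling, a minimal usage sketch of the public API these patches keep exercising (``mwparserfromhell.parse``, ``filter_templates``, and the ``Template`` accessors covered by the test suite)::

    import mwparserfromhell

    code = mwparserfromhell.parse("{{foo|bar|baz=qux}}")
    for template in code.filter_templates():
        print(template.name)              # -> foo
        print(template.get("baz").value)  # -> qux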
From 08cafc05766afe98c612aca21caa1882e5c2e5c7 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 4 Jul 2014 16:57:00 -0400
Subject: [PATCH 027/102] Raise ParserError for internal problems. Improve coverage. Cleanup.

---
 .coveragerc | 4 +++
 .gitignore | 2 ++
 CHANGELOG | 3 ++
 docs/changelog.rst | 3 ++
 mwparserfromhell/nodes/__init__.py | 4 +--
 mwparserfromhell/parser/__init__.py | 31 ++++++++++++++++++---
 mwparserfromhell/parser/builder.py | 23 +++++++---------
 mwparserfromhell/parser/tokenizer.c | 53 ++++++++++++++++++++++++++++++------
 mwparserfromhell/parser/tokenizer.h | 3 ++
 mwparserfromhell/parser/tokenizer.py | 7 +++--
 mwparserfromhell/parser/tokens.py | 2 +-
 mwparserfromhell/utils.py | 2 +-
 tests/test_builder.py | 8 +++++-
 13 files changed, 113 insertions(+), 32 deletions(-)
 create mode 100644 .coveragerc

diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..0a92f19
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,4 @@
+[report]
+exclude_lines =
+    pragma: no cover
+    raise NotImplementedError()
diff --git a/.gitignore b/.gitignore
index f7f7bd9..3da2db3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,9 +4,11 @@
 *.dll
 *.egg
 *.egg-info
+.coverage
 .DS_Store
 __pycache__
 build
 dist
 docs/_build
 scripts/*.log
+htmlcov/
diff --git a/CHANGELOG b/CHANGELOG
index 4f4f77b..d733cee 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -10,6 +10,9 @@ v0.4 (unreleased):
   option, RECURSE_OTHERS, which recurses over all children except instances
   of 'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)`
   returns all un-nested templates).
+- If something goes wrong while parsing, ParserError will now be raised.
+  Previously, the parser would produce an unclear BadRoute exception or allow
+  an incorrect node tree to be built.
 - Fixed a parser bug involving nested tags.
 - Updated and fixed some documentation.

 v0.3.3 (released April 22, 2014):
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 0576d29..a530733 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -18,6 +18,9 @@ Unreleased
   which recurses over all children except instances of *forcetype* (for
   example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all
   un-nested templates).
+- If something goes wrong while parsing, :py:exc:`.ParserError` will now be
+  raised. Previously, the parser would produce an unclear :py:exc:`.BadRoute`
+  exception or allow an incorrect node tree to be built.
 - Fixed a parser bug involving nested tags.
 - Updated and fixed some documentation.

 v0.3.3
diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py
index 223cc67..d6f60bd 100644
--- a/mwparserfromhell/nodes/__init__.py
+++ b/mwparserfromhell/nodes/__init__.py
@@ -55,8 +55,8 @@ class Node(StringMixIn):
         raise NotImplementedError()

     def __children__(self):
-        return # Funny generator-that-yields-nothing syntax
-        yield
+        return
+        yield # pragma: no cover (this is a generator that yields nothing)

     def __strip__(self, normalize, collapse):
         return None
diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py
index 8bac295..467d5df 100644
--- a/mwparserfromhell/parser/__init__.py
+++ b/mwparserfromhell/parser/__init__.py
@@ -26,6 +26,19 @@ modules: the :py:mod:`~.tokenizer` and the :py:mod:`~.builder`. This module
 joins them together under one interface.
 """

+class ParserError(Exception):
+    """Exception raised when an internal error occurs while parsing.
+
+    This does not mean that the wikicode was invalid, because invalid markup
+    should still be parsed correctly.
This means that the parser caught itself + with an impossible internal state and is bailing out before other problems + can happen. Its appearance indicates a bug. + """ + def __init__(self, extra): + msg = "This is a bug and should be reported. Info: {0}.".format(extra) + super(ParserError, self).__init__(msg) + + from .builder import Builder from .tokenizer import Tokenizer try: @@ -35,15 +48,22 @@ except ImportError: CTokenizer = None use_c = False -__all__ = ["use_c", "Parser"] +__all__ = ["use_c", "Parser", "ParserError"] class Parser(object): """Represents a parser for wikicode. Actual parsing is a two-step process: first, the text is split up into a - series of tokens by the :py:class:`~.Tokenizer`, and then the tokens are - converted into trees of :py:class:`~.Wikicode` objects and - :py:class:`~.Node`\ s by the :py:class:`~.Builder`. + series of tokens by the :py:class:`.Tokenizer`, and then the tokens are + converted into trees of :py:class:`.Wikicode` objects and + :py:class:`.Node`\ s by the :py:class:`.Builder`. + + Instances of this class or its dependents (:py:class:`.Tokenizer` and + :py:class:`.Builder`) should not be shared between threads. + :py:meth:`parse` can be called multiple times as long as it is not done + concurrently. In general, there is no need to do this because parsing + should be done through :py:func:`mwparserfromhell.parse`, which creates a + new :py:class:`.Parser` object as necessary. """ def __init__(self): @@ -65,6 +85,9 @@ class Parser(object): If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be parsed, but instead will be treated as plain text. + + If there is an internal error while parsing, :py:exc:`.ParserError` + will be raised. """ tokens = self._tokenizer.tokenize(text, context, skip_style_tags) code = self._builder.build(tokens) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 5f8ce45..559bd54 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -22,7 +22,7 @@ from __future__ import unicode_literals -from . import tokens +from . import tokens, ParserError from ..compat import str from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, Template, Text, Wikilink) @@ -33,33 +33,28 @@ from ..wikicode import Wikicode __all__ = ["Builder"] class Builder(object): - """Combines a sequence of tokens into a tree of ``Wikicode`` objects. + """Builds a tree of nodes out of a sequence of tokens. To use, pass a list of :py:class:`~.Token`\ s to the :py:meth:`build` method. The list will be exhausted as it is parsed and a - :py:class:`~.Wikicode` object will be returned. + :py:class:`.Wikicode` object containing the node tree will be returned. """ def __init__(self): self._tokens = [] self._stacks = [] - def _wrap(self, nodes): - """Properly wrap a list of nodes in a ``Wikicode`` object.""" - return Wikicode(SmartList(nodes)) - def _push(self): """Push a new node list onto the stack.""" self._stacks.append([]) - def _pop(self, wrap=True): + def _pop(self): """Pop the current node list off of the stack. - If *wrap* is ``True``, we will call :py:meth:`_wrap` on the list. + The raw node list is wrapped in a :py:class:`.SmartList` and then in a + :py:class:`.Wikicode` object. 
""" - if wrap: - return self._wrap(self._stacks.pop()) - return self._stacks.pop() + return Wikicode(SmartList(self._stacks.pop())) def _write(self, item): """Append a node to the current node list.""" @@ -84,7 +79,7 @@ class Builder(object): self._tokens.append(token) value = self._pop() if key is None: - key = self._wrap([Text(str(default))]) + key = Wikicode(SmartList([Text(str(default))])) return Parameter(key, value, showkey) else: self._write(self._handle_token(token)) @@ -270,6 +265,8 @@ class Builder(object): return self._handle_comment() elif isinstance(token, tokens.TagOpenOpen): return self._handle_tag(token) + err = "_handle_token() got unexpected {0}".format(type(token).__name__) + raise ParserError(err) def build(self, tokenlist): """Build a Wikicode object from a list tokens and return it.""" diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 41ce5ac..6ab8570 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -347,7 +347,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) /* Fail the current tokenization route. Discards the current - stack/context/textbuffer and raises a BadRoute exception. + stack/context/textbuffer and sets the BAD_ROUTE flag. */ static void* Tokenizer_fail_route(Tokenizer* self) { @@ -2681,7 +2681,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) */ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) { - PyObject *text, *temp; + PyObject *text, *temp, *tokens; int context = 0, skip_style_tags = 0; if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { @@ -2704,13 +2704,29 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) Py_XDECREF(temp); self->text = text; } + self->head = self->global = self->depth = self->cycles = 0; self->length = PyList_GET_SIZE(self->text); self->skip_style_tags = skip_style_tags; - return Tokenizer_parse(self, context, 1); + tokens = Tokenizer_parse(self, context, 1); + + if (!tokens && !PyErr_Occurred()) { + if (!ParserError) { + if (load_exceptions()) + return NULL; + } + if (BAD_ROUTE) { + RESET_ROUTE(); + PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); + } + else + PyErr_SetString(ParserError, "C tokenizer exited unexpectedly"); + return NULL; + } + return tokens; } -static int load_entitydefs(void) +static int load_entities(void) { PyObject *tempmod, *defmap, *deflist; unsigned numdefs, i; @@ -2814,7 +2830,7 @@ static int load_tokens(void) return 0; } -static int load_definitions(void) +static int load_defs(void) { PyObject *tempmod, *globals = PyEval_GetGlobals(), @@ -2835,6 +2851,29 @@ static int load_definitions(void) return 0; } +static int load_exceptions(void) +{ + PyObject *tempmod, *parsermod, + *globals = PyEval_GetGlobals(), + *locals = PyEval_GetLocals(), + *fromlist = PyList_New(1), + *modname = IMPORT_NAME_FUNC("parser"); + char *name = "mwparserfromhell"; + + if (!fromlist || !modname) + return -1; + PyList_SET_ITEM(fromlist, 0, modname); + tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0); + Py_DECREF(fromlist); + if (!tempmod) + return -1; + parsermod = PyObject_GetAttrString(tempmod, "parser"); + Py_DECREF(tempmod); + ParserError = PyObject_GetAttrString(parsermod, "ParserError"); + Py_DECREF(parsermod); + return 0; +} + PyMODINIT_FUNC INIT_FUNC_NAME(void) { PyObject *module; @@ -2851,9 +2890,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void) PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", 
Py_True); EMPTY = PyUnicode_FromString(""); NOARGS = PyTuple_New(0); - if (!EMPTY || !NOARGS) - INIT_ERROR; - if (load_entitydefs() || load_tokens() || load_definitions()) + if (!EMPTY || !NOARGS || load_entities() || load_tokens() || load_defs()) INIT_ERROR; #ifdef IS_PY3K return module; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 032480d..4312e2f 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -62,6 +62,7 @@ static char** entitydefs; static PyObject* EMPTY; static PyObject* NOARGS; +static PyObject* ParserError; static PyObject* definitions; @@ -268,6 +269,8 @@ static int Tokenizer_parse_tag(Tokenizer*); static PyObject* Tokenizer_parse(Tokenizer*, int, int); static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); +static int load_exceptions(void); + /* Macros for Python 2/3 compatibility: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e69a823..9af9204 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -24,7 +24,7 @@ from __future__ import unicode_literals from math import log import re -from . import contexts, tokens +from . import contexts, tokens, ParserError from ..compat import htmlentities, range from ..definitions import (get_html_tag, is_parsable, is_single, is_single_only, is_scheme) @@ -1154,4 +1154,7 @@ class Tokenizer(object): split = self.regex.split(text) self._text = [segment for segment in split if segment] self._head = self._global = self._depth = self._cycles = 0 - return self._parse(context) + try: + return self._parse(context) + except BadRoute: # pragma: no cover (untestable/exceptional case) + raise ParserError("Python tokenizer exited with BadRoute") diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 40e5158..c7cc3ef 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -34,7 +34,7 @@ from ..compat import py3k, str __all__ = ["Token"] -class Token (dict): +class Token(dict): """A token stores the semantic meaning of a unit of wikicode.""" def __repr__(self): diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index fd54ad0..8dc5e4e 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -66,7 +66,7 @@ def parse_anything(value, context=0, skip_style_tags=False): nodelist = SmartList() for item in value: nodelist += parse_anything(item, context, skip_style_tags).nodes + return Wikicode(nodelist) except TypeError: error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" raise ValueError(error.format(type(value).__name__, value)) - return Wikicode(nodelist) diff --git a/tests/test_builder.py b/tests/test_builder.py index c8fdca3..ed306f7 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -30,7 +30,7 @@ except ImportError: from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, Template, Text, Wikilink) from mwparserfromhell.nodes.extras import Attribute, Parameter -from mwparserfromhell.parser import tokens +from mwparserfromhell.parser import tokens, ParserError from mwparserfromhell.parser.builder import Builder from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext @@ -420,5 +420,11 @@ class TestBuilder(TreeEqualityTestCase): named=True)]))])]) self.assertWikicodeEqual(valid, self.builder.build(test)) + def test_parser_error(self): + """test whether ParserError gets 
thrown for bad input"""
+        msg = r"_handle_token\(\) got unexpected TemplateClose"
+        self.assertRaisesRegexp(
+            ParserError, msg, self.builder.build, [tokens.TemplateClose()])
+
 if __name__ == "__main__":
     unittest.main(verbosity=2)

From 8bc7ea669da21e9a17e5bc94cbb4329db1220315 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 4 Jul 2014 20:47:08 -0400
Subject: [PATCH 028/102] Improve test coverage; fix some node-related bugs.

* Parameters with non-integer keys can no longer be created with
  showkey=False, nor have the value of this attribute be set to False later.
* Calling Template.remove() with a Parameter object that is not part of the
  template now raises ValueError instead of doing nothing.
* Added tests for HTMLEntity._unichr() being called with out-of-range
  codepoints.
* Added tests for Tag.__children__() and Tag.__showtree__() involving
  attributes that have no values.
---
 .coveragerc | 4 +++
 CHANGELOG | 5 +
 docs/changelog.rst | 7 ++
 mwparserfromhell/nodes/extras/parameter.py | 13 ++-
 mwparserfromhell/nodes/html_entity.py | 22 ++--
 mwparserfromhell/nodes/template.py | 11 +-
 tests/test_builder.py | 5 +-
 tests/test_html_entity.py | 5 +
 tests/test_parameter.py | 5 +-
 tests/test_tag.py | 18 ++--
 tests/test_template.py | 159 ++++++++++++++---------------
 11 files changed, 140 insertions(+), 114 deletions(-)

diff --git a/.coveragerc b/.coveragerc
index 0a92f19..909a0e2 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -2,3 +2,7 @@
 exclude_lines =
     pragma: no cover
     raise NotImplementedError()
+partial_branches =
+    pragma: no branch
+    if py3k:
+    if not py3k:
diff --git a/CHANGELOG b/CHANGELOG
index d733cee..1200575 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -10,10 +10,15 @@ v0.4 (unreleased):
   option, RECURSE_OTHERS, which recurses over all children except instances
   of 'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)`
   returns all un-nested templates).
+- Calling Template.remove() with a Parameter object that is not part of the
+  template now raises ValueError instead of doing nothing.
+- Parameters with non-integer keys can no longer be created with
+  'showkey=False', nor have the value of this attribute be set to False later.
 - If something goes wrong while parsing, ParserError will now be raised.
   Previously, the parser would produce an unclear BadRoute exception or allow
   an incorrect node tree to be built.
 - Fixed a parser bug involving nested tags.
+- Test coverage has been improved, and some minor related bugs have been fixed.
 - Updated and fixed some documentation.

 v0.3.3 (released April 22, 2014):
diff --git a/docs/changelog.rst b/docs/changelog.rst
index a530733..ba26722 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -18,10 +18,17 @@ Unreleased
   which recurses over all children except instances of *forcetype* (for
   example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all
   un-nested templates).
+- Calling :py:meth:`.Template.remove` with a :py:class:`.Parameter` object that
+  is not part of the template now raises :py:exc:`ValueError` instead of doing
+  nothing.
+- :py:class:`.Parameter`\ s with non-integer keys can no longer be created with
+  *showkey=False*, nor have the value of this attribute be set to *False*
+  later.
 - If something goes wrong while parsing, :py:exc:`.ParserError` will now be
   raised. Previously, the parser would produce an unclear :py:exc:`.BadRoute`
   exception or allow an incorrect node tree to be built.
 - Fixed a parser bug involving nested tags.
+- Test coverage has been improved, and some minor related bugs have been fixed.
- Updated and fixed some documentation. v0.3.3 diff --git a/mwparserfromhell/nodes/extras/parameter.py b/mwparserfromhell/nodes/extras/parameter.py index e273af9..5a67ae0 100644 --- a/mwparserfromhell/nodes/extras/parameter.py +++ b/mwparserfromhell/nodes/extras/parameter.py @@ -21,6 +21,7 @@ # SOFTWARE. from __future__ import unicode_literals +import re from ...compat import str from ...string_mixin import StringMixIn @@ -39,6 +40,8 @@ class Parameter(StringMixIn): def __init__(self, name, value, showkey=True): super(Parameter, self).__init__() + if not showkey and not self.can_hide_key(name): + raise ValueError("key {0!r} cannot be hidden".format(name)) self._name = name self._value = value self._showkey = showkey @@ -48,6 +51,11 @@ class Parameter(StringMixIn): return str(self.name) + "=" + str(self.value) return str(self.value) + @staticmethod + def can_hide_key(key): + """Return whether or not the given key can be hidden.""" + return re.match(r"[1-9][0-9]*$", str(key).strip()) + @property def name(self): """The name of the parameter as a :py:class:`~.Wikicode` object.""" @@ -73,4 +81,7 @@ class Parameter(StringMixIn): @showkey.setter def showkey(self, newval): - self._showkey = bool(newval) + newval = bool(newval) + if not newval and not self.can_hide_key(self.name): + raise ValueError("parameter key cannot be hidden") + self._showkey = newval diff --git a/mwparserfromhell/nodes/html_entity.py b/mwparserfromhell/nodes/html_entity.py index c75cb99..95f1492 100644 --- a/mwparserfromhell/nodes/html_entity.py +++ b/mwparserfromhell/nodes/html_entity.py @@ -77,17 +77,17 @@ class HTMLEntity(Node): # Test whether we're on the wide or narrow Python build. Check # the length of a non-BMP code point # (U+1F64A, SPEAK-NO-EVIL MONKEY): - if len("\U0001F64A") == 2: - # Ensure this is within the range we can encode: - if value > 0x10FFFF: - raise ValueError("unichr() arg not in range(0x110000)") - code = value - 0x10000 - if value < 0: # Invalid code point - raise - lead = 0xD800 + (code >> 10) - trail = 0xDC00 + (code % (1 << 10)) - return unichr(lead) + unichr(trail) - raise + if len("\U0001F64A") == 1: # pragma: no cover + raise + # Ensure this is within the range we can encode: + if value > 0x10FFFF: + raise ValueError("unichr() arg not in range(0x110000)") + code = value - 0x10000 + if value < 0: # Invalid code point + raise + lead = 0xD800 + (code >> 10) + trail = 0xDC00 + (code % (1 << 10)) + return unichr(lead) + unichr(trail) @property def value(self): diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 3b5b35c..c0fda5d 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -155,6 +155,7 @@ class Template(Node): else: self.params.pop(i) return + raise ValueError(needle) @property def name(self): @@ -254,21 +255,19 @@ class Template(Node): return existing if showkey is None: - try: + if Parameter.can_hide_key(name): int_name = int(str(name)) - except ValueError: - showkey = True - else: int_keys = set() for param in self.params: if not param.showkey: - if re.match(r"[1-9][0-9]*$", param.name.strip()): - int_keys.add(int(str(param.name))) + int_keys.add(int(str(param.name))) expected = min(set(range(1, len(int_keys) + 2)) - int_keys) if expected == int_name: showkey = False else: showkey = True + else: + showkey = True if not showkey: self._surface_escape(value, "=") diff --git a/tests/test_builder.py b/tests/test_builder.py index ed306f7..58e3d1e 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ 
-27,6 +27,7 @@ try: except ImportError: import unittest +from mwparserfromhell.compat import py3k from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, Template, Text, Wikilink) from mwparserfromhell.nodes.extras import Attribute, Parameter @@ -422,9 +423,9 @@ class TestBuilder(TreeEqualityTestCase): def test_parser_error(self): """test whether ParserError gets thrown for bad input""" + func = self.assertRaisesRegex if py3k else self.assertRaisesRegexp msg = r"_handle_token\(\) got unexpected TemplateClose" - self.assertRaisesRegexp( - ParserError, msg, self.builder.build, [tokens.TemplateClose()]) + func(ParserError, msg, self.builder.build, [tokens.TemplateClose()]) if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/tests/test_html_entity.py b/tests/test_html_entity.py index eb6f606..3df596a 100644 --- a/tests/test_html_entity.py +++ b/tests/test_html_entity.py @@ -108,6 +108,7 @@ class TestHTMLEntity(TreeEqualityTestCase): self.assertRaises(ValueError, setattr, node3, "value", -1) self.assertRaises(ValueError, setattr, node1, "value", 110000) self.assertRaises(ValueError, setattr, node1, "value", "1114112") + self.assertRaises(ValueError, setattr, node1, "value", "12FFFF") def test_named(self): """test getter/setter for the named attribute""" @@ -163,10 +164,14 @@ class TestHTMLEntity(TreeEqualityTestCase): node2 = HTMLEntity("107") node3 = HTMLEntity("e9") node4 = HTMLEntity("1f648") + node5 = HTMLEntity("-2") + node6 = HTMLEntity("110000", named=False, hexadecimal=True) self.assertEqual("\xa0", node1.normalize()) self.assertEqual("k", node2.normalize()) self.assertEqual("é", node3.normalize()) self.assertEqual("\U0001F648", node4.normalize()) + self.assertRaises(ValueError, node5.normalize) + self.assertRaises(ValueError, node6.normalize) if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/tests/test_parameter.py b/tests/test_parameter.py index ee52b59..2a4bb75 100644 --- a/tests/test_parameter.py +++ b/tests/test_parameter.py @@ -71,9 +71,10 @@ class TestParameter(TreeEqualityTestCase): self.assertFalse(node1.showkey) self.assertTrue(node2.showkey) node1.showkey = True - node2.showkey = "" self.assertTrue(node1.showkey) - self.assertFalse(node2.showkey) + node1.showkey = "" + self.assertFalse(node1.showkey) + self.assertRaises(ValueError, setattr, node2, "showkey", False) if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/tests/test_tag.py b/tests/test_tag.py index 111511a..0eae713 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -33,6 +33,7 @@ from mwparserfromhell.nodes.extras import Attribute from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext agen = lambda name, value: Attribute(wraptext(name), wraptext(value)) +agennv = lambda name: Attribute(wraptext(name)) agennq = lambda name, value: Attribute(wraptext(name), wraptext(value), False) agenp = lambda name, v, a, b, c: Attribute(wraptext(name), v, True, a, b, c) agenpnv = lambda name, a, b, c: Attribute(wraptext(name), None, True, a, b, c) @@ -74,10 +75,10 @@ class TestTag(TreeEqualityTestCase): node1 = Tag(wraptext("ref"), wraptext("foobar")) # '''bold text''' node2 = Tag(wraptext("b"), wraptext("bold text"), wiki_markup="'''") - # + # node3 = Tag(wraptext("img"), - attrs=[Attribute(wraptext("id"), wraptext("foo")), - Attribute(wraptext("class"), wraptext("bar"))], + attrs=[agen("id", "foo"), agen("class", "bar"), + agennv("selected")], self_closing=True, padding=" ") gen1 = node1.__children__() @@ -89,6 +90,7 @@ 
class TestTag(TreeEqualityTestCase): self.assertEqual(node3.attributes[0].value, next(gen3)) self.assertEqual(node3.attributes[1].name, next(gen3)) self.assertEqual(node3.attributes[1].value, next(gen3)) + self.assertEqual(node3.attributes[2].name, next(gen3)) self.assertEqual(node1.contents, next(gen1)) self.assertEqual(node2.contents, next(gen2)) self.assertEqual(node1.closing_tag, next(gen1)) @@ -113,7 +115,8 @@ class TestTag(TreeEqualityTestCase): getter, marker = object(), object() get = lambda code: output.append((getter, code)) mark = lambda: output.append(marker) - node1 = Tag(wraptext("ref"), wraptext("text"), [agen("name", "foo")]) + node1 = Tag(wraptext("ref"), wraptext("text"), + [agen("name", "foo"), agennv("selected")]) node2 = Tag(wraptext("br"), self_closing=True, padding=" ") node3 = Tag(wraptext("br"), self_closing=True, invalid=True, implicit=True, padding=" ") @@ -122,9 +125,10 @@ class TestTag(TreeEqualityTestCase): node3.__showtree__(output.append, get, mark) valid = [ "<", (getter, node1.tag), (getter, node1.attributes[0].name), - " = ", marker, (getter, node1.attributes[0].value), ">", - (getter, node1.contents), "", - "<", (getter, node2.tag), "/>", ""] + " = ", marker, (getter, node1.attributes[0].value), + (getter, node1.attributes[1].name), ">", (getter, node1.contents), + "", "<", (getter, node2.tag), + "/>", ""] self.assertEqual(valid, output) def test_tag(self): diff --git a/tests/test_template.py b/tests/test_template.py index 584b02f..e015a6a 100644 --- a/tests/test_template.py +++ b/tests/test_template.py @@ -130,6 +130,8 @@ class TestTemplate(TreeEqualityTestCase): self.assertTrue(node4.has("b", False)) self.assertTrue(node3.has("b", True)) self.assertFalse(node4.has("b", True)) + self.assertFalse(node1.has_param("foobar", False)) + self.assertTrue(node2.has_param(1, False)) def test_get(self): """test Template.get()""" @@ -176,52 +178,41 @@ class TestTemplate(TreeEqualityTestCase): pgens("b ", " c\n"), pgens("\nd ", " e"), pgens("\nf ", "g ")]) node16 = Template(wraptext("a"), [ pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")]) - node17 = Template(wraptext("a"), [ - pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")]) - node18 = Template(wraptext("a\n"), [ - pgens("b ", "c\n"), pgens("d ", " e"), pgens("f ", "g\n"), - pgens("h ", " i\n")]) - node19 = Template(wraptext("a"), [ - pgens("b ", " c\n"), pgens("\nd ", " e"), pgens("\nf ", "g ")]) - node20 = Template(wraptext("a"), [ - pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")]) - node21 = Template(wraptext("a"), [pgenh("1", "b")]) - node22 = Template(wraptext("a"), [pgenh("1", "b")]) - node23 = Template(wraptext("a"), [pgenh("1", "b")]) - node24 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"), + node17 = Template(wraptext("a"), [pgenh("1", "b")]) + node18 = Template(wraptext("a"), [pgenh("1", "b")]) + node19 = Template(wraptext("a"), [pgenh("1", "b")]) + node20 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"), pgenh("3", "d"), pgenh("4", "e")]) - node25 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"), + node21 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"), pgens("4", "d"), pgens("5", "e")]) - node26 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"), + node22 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"), pgens("4", "d"), pgens("5", "e")]) + node23 = Template(wraptext("a"), [pgenh("1", "b")]) + node24 = Template(wraptext("a"), [pgenh("1", "b")]) + node25 = Template(wraptext("a"), [pgens("b", 
"c")]) + node26 = Template(wraptext("a"), [pgenh("1", "b")]) node27 = Template(wraptext("a"), [pgenh("1", "b")]) - node28 = Template(wraptext("a"), [pgenh("1", "b")]) - node29 = Template(wraptext("a"), [pgens("b", "c")]) - node30 = Template(wraptext("a"), [pgenh("1", "b")]) - node31 = Template(wraptext("a"), [pgenh("1", "b")]) - node32 = Template(wraptext("a"), [pgens("1", "b")]) - node33 = Template(wraptext("a"), [ + node28 = Template(wraptext("a"), [pgens("1", "b")]) + node29 = Template(wraptext("a"), [ pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")]) - node34 = Template(wraptext("a\n"), [ + node30 = Template(wraptext("a\n"), [ pgens("b ", "c\n"), pgens("d ", " e"), pgens("f ", "g\n"), pgens("h ", " i\n")]) - node35 = Template(wraptext("a"), [ + node31 = Template(wraptext("a"), [ pgens("b ", " c\n"), pgens("\nd ", " e"), pgens("\nf ", "g ")]) - node36 = Template(wraptext("a"), [ + node32 = Template(wraptext("a"), [ pgens("\nb ", " c "), pgens("\nd ", " e "), pgens("\nf ", " g ")]) - node37 = Template(wraptext("a"), [pgens("b", "c"), pgens("d", "e"), - pgens("b", "f"), pgens("b", "h"), - pgens("i", "j")]) - node37 = Template(wraptext("a"), [pgens("b", "c"), pgens("d", "e"), + node33 = Template(wraptext("a"), [pgens("b", "c"), pgens("d", "e"), pgens("b", "f"), pgens("b", "h"), pgens("i", "j")]) - node38 = Template(wraptext("a"), [pgens("1", "b"), pgens("x", "y"), + node34 = Template(wraptext("a"), [pgens("1", "b"), pgens("x", "y"), pgens("1", "c"), pgens("2", "d")]) - node39 = Template(wraptext("a"), [pgens("1", "b"), pgens("x", "y"), + node35 = Template(wraptext("a"), [pgens("1", "b"), pgens("x", "y"), pgenh("1", "c"), pgenh("2", "d")]) - node40 = Template(wraptext("a"), [pgens("b", "c"), pgens("d", "e"), + node36 = Template(wraptext("a"), [pgens("b", "c"), pgens("d", "e"), pgens("f", "g")]) - node41 = Template(wraptext("a"), [pgenh("1", "")]) + node37 = Template(wraptext("a"), [pgenh("1", "")]) + node38 = Template(wraptext("abc")) node1.add("e", "f", showkey=True) node2.add(2, "g", showkey=False) @@ -241,31 +232,29 @@ class TestTemplate(TreeEqualityTestCase): node14.add("j", "k", showkey=True) node15.add("h", "i", showkey=True) node16.add("h", "i", showkey=True, preserve_spacing=False) - node17.add("h", "i", showkey=False) - node18.add("j", "k", showkey=False) - node19.add("h", "i", showkey=False) - node20.add("h", "i", showkey=False, preserve_spacing=False) - node21.add("2", "c") - node22.add("3", "c") - node23.add("c", "d") - node24.add("5", "f") - node25.add("3", "f") - node26.add("6", "f") - node27.add("c", "foo=bar") - node28.add("2", "foo=bar") - node29.add("b", "d") - node30.add("1", "foo=bar") - node31.add("1", "foo=bar", showkey=True) - node32.add("1", "foo=bar", showkey=False) - node33.add("d", "foo") - node34.add("f", "foo") - node35.add("f", "foo") - node36.add("d", "foo", preserve_spacing=False) - node37.add("b", "k") - node38.add("1", "e") - node39.add("1", "e") - node40.add("d", "h", before="b") - node41.add(1, "b") + node17.add("2", "c") + node18.add("3", "c") + node19.add("c", "d") + node20.add("5", "f") + node21.add("3", "f") + node22.add("6", "f") + node23.add("c", "foo=bar") + node24.add("2", "foo=bar") + node25.add("b", "d") + node26.add("1", "foo=bar") + node27.add("1", "foo=bar", showkey=True) + node28.add("1", "foo=bar", showkey=False) + node29.add("d", "foo") + node30.add("f", "foo") + node31.add("f", "foo") + node32.add("d", "foo", preserve_spacing=False) + node33.add("b", "k") + node34.add("1", "e") + node35.add("1", "e") + node36.add("d", "h", 
before="b") + node37.add(1, "b") + node38.add("1", "foo") + self.assertRaises(ValueError, node38.add, "z", "bar", showkey=False) self.assertEqual("{{a|b=c|d|e=f}}", node1) self.assertEqual("{{a|b=c|d|g}}", node2) @@ -285,34 +274,31 @@ class TestTemplate(TreeEqualityTestCase): self.assertEqual("{{a\n|b =c\n|d = e|f =g\n|h = i\n|j =k\n}}", node14) self.assertEqual("{{a|b = c\n|\nd = e|\nf =g |h =i}}", node15) self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|h=i}}", node16) - self.assertEqual("{{a|\nb = c|\nd = e|\nf = g| i}}", node17) - self.assertEqual("{{a\n|b =c\n|d = e|f =g\n|h = i\n|k\n}}", node18) - self.assertEqual("{{a|b = c\n|\nd = e|\nf =g |i}}", node19) - self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|i}}", node20) - self.assertEqual("{{a|b|c}}", node21) - self.assertEqual("{{a|b|3=c}}", node22) - self.assertEqual("{{a|b|c=d}}", node23) - self.assertEqual("{{a|b|c|d|e|f}}", node24) - self.assertEqual("{{a|b|c|4=d|5=e|f}}", node25) - self.assertEqual("{{a|b|c|4=d|5=e|6=f}}", node26) - self.assertEqual("{{a|b|c=foo=bar}}", node27) - self.assertEqual("{{a|b|foo=bar}}", node28) - self.assertIsInstance(node28.params[1].value.get(1), HTMLEntity) - self.assertEqual("{{a|b=d}}", node29) - self.assertEqual("{{a|foo=bar}}", node30) - self.assertIsInstance(node30.params[0].value.get(1), HTMLEntity) - self.assertEqual("{{a|1=foo=bar}}", node31) - self.assertEqual("{{a|foo=bar}}", node32) - self.assertIsInstance(node32.params[0].value.get(1), HTMLEntity) - self.assertEqual("{{a|\nb = c|\nd = foo|\nf = g}}", node33) - self.assertEqual("{{a\n|b =c\n|d = e|f =foo\n|h = i\n}}", node34) - self.assertEqual("{{a|b = c\n|\nd = e|\nf =foo }}", node35) - self.assertEqual("{{a|\nb = c |\nd =foo|\nf = g }}", node36) - self.assertEqual("{{a|b=k|d=e|i=j}}", node37) - self.assertEqual("{{a|1=e|x=y|2=d}}", node38) - self.assertEqual("{{a|x=y|e|d}}", node39) - self.assertEqual("{{a|b=c|d=h|f=g}}", node40) - self.assertEqual("{{a|b}}", node41) + self.assertEqual("{{a|b|c}}", node17) + self.assertEqual("{{a|b|3=c}}", node18) + self.assertEqual("{{a|b|c=d}}", node19) + self.assertEqual("{{a|b|c|d|e|f}}", node20) + self.assertEqual("{{a|b|c|4=d|5=e|f}}", node21) + self.assertEqual("{{a|b|c|4=d|5=e|6=f}}", node22) + self.assertEqual("{{a|b|c=foo=bar}}", node23) + self.assertEqual("{{a|b|foo=bar}}", node24) + self.assertIsInstance(node24.params[1].value.get(1), HTMLEntity) + self.assertEqual("{{a|b=d}}", node25) + self.assertEqual("{{a|foo=bar}}", node26) + self.assertIsInstance(node26.params[0].value.get(1), HTMLEntity) + self.assertEqual("{{a|1=foo=bar}}", node27) + self.assertEqual("{{a|foo=bar}}", node28) + self.assertIsInstance(node28.params[0].value.get(1), HTMLEntity) + self.assertEqual("{{a|\nb = c|\nd = foo|\nf = g}}", node29) + self.assertEqual("{{a\n|b =c\n|d = e|f =foo\n|h = i\n}}", node30) + self.assertEqual("{{a|b = c\n|\nd = e|\nf =foo }}", node31) + self.assertEqual("{{a|\nb = c |\nd =foo|\nf = g }}", node32) + self.assertEqual("{{a|b=k|d=e|i=j}}", node33) + self.assertEqual("{{a|1=e|x=y|2=d}}", node34) + self.assertEqual("{{a|x=y|e|d}}", node35) + self.assertEqual("{{a|b=c|d=h|f=g}}", node36) + self.assertEqual("{{a|b}}", node37) + self.assertEqual("{{abc|foo}}", node38) def test_remove(self): """test Template.remove()""" @@ -373,6 +359,8 @@ class TestTemplate(TreeEqualityTestCase): node26 = Template(wraptext("foo"), [ pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"), pgens("a", "b")]) + node27 = Template(wraptext("foo"), [pgenh("1", "bar")]) + node28 = Template(wraptext("foo"), 
[pgenh("1", "bar")]) node2.remove("1") node2.remove("abc") @@ -430,6 +418,7 @@ class TestTemplate(TreeEqualityTestCase): self.assertEqual("{{foo|a=|c=d|e=f|a=b|a=b}}", node24) self.assertEqual("{{foo|a=b|c=d|e=f|a=b}}", node25) self.assertEqual("{{foo|a=b|c=d|e=f|a=|a=b}}", node26) + self.assertRaises(ValueError, node27.remove, node28.get(1)) if __name__ == "__main__": unittest.main(verbosity=2) From a4c2fd023adfe95fdd5552cc2bab90a0bbc16a2a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 5 Jul 2014 01:00:11 -0400 Subject: [PATCH 029/102] Remove some useless code in the tokenizers. --- mwparserfromhell/parser/tokenizer.c | 4 +--- mwparserfromhell/parser/tokenizer.py | 8 +++----- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 6ab8570..963e7d7 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -832,8 +832,6 @@ static int Tokenizer_parse_wikilink(Tokenizer* self) Py_DECREF(wikilink); if (Tokenizer_emit(self, WikilinkClose)) return -1; - if (self->topstack->context & LC_FAIL_NEXT) - self->topstack->context ^= LC_FAIL_NEXT; return 0; } @@ -1718,7 +1716,7 @@ Tokenizer_handle_tag_data(Tokenizer* self, TagData* data, Py_UNICODE chunk) return -1; } } - else if (data->context & TAG_ATTR_VALUE) { + else { // data->context & TAG_ATTR_VALUE assured escaped = (Tokenizer_READ_BACKWARDS(self, 1) == '\\' && Tokenizer_READ_BACKWARDS(self, 2) != '\\'); if (data->context & TAG_NOTE_QUOTE) { diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9af9204..6430f0f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -255,7 +255,7 @@ class Tokenizer(object): self._context ^= contexts.TEMPLATE_NAME elif self._context & contexts.TEMPLATE_PARAM_VALUE: self._context ^= contexts.TEMPLATE_PARAM_VALUE - elif self._context & contexts.TEMPLATE_PARAM_KEY: + else: self._emit_all(self._pop(keep_context=True)) self._context |= contexts.TEMPLATE_PARAM_KEY self._emit(tokens.TemplateParamSeparator()) @@ -296,8 +296,6 @@ class Tokenizer(object): self._head = reset self._emit_text("[[") else: - if self._context & contexts.FAIL_NEXT: - self._context ^= contexts.FAIL_NEXT self._emit(tokens.WikilinkOpen()) self._emit_all(wikilink) self._emit(tokens.WikilinkClose()) @@ -687,7 +685,7 @@ class Tokenizer(object): self._push_tag_buffer(data) data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - elif data.context & data.CX_ATTR_VALUE: + else: # data.context & data.CX_ATTR_VALUE assured escaped = self._read(-1) == "\\" and self._read(-2) != "\\" if data.context & data.CX_NOTE_QUOTE: data.context ^= data.CX_NOTE_QUOTE @@ -943,7 +941,7 @@ class Tokenizer(object): elif ticks == 3: if self._parse_bold(): return self._pop() - elif ticks == 5: + else: # ticks == 5 self._parse_italics_and_bold() self._head -= 1 From b997e4cd7131b541dbf9027dbf67ebc46ed356ea Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 5 Jul 2014 04:21:56 -0400 Subject: [PATCH 030/102] Support attributes quoted with '; add required quotes in value setter. 
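An illustrative sketch of the new behavior, mirroring the test_tag.py changes below (``parse`` and ``filter_tags`` are existing public API, not part of this patch)::

    import mwparserfromhell

    code = mwparserfromhell.parse("<ref>cite</ref>")
    tag = code.filter_tags()[0]
    tag.add("name", "value", quotes="'")  # single-quoted attributes now work
    print(code)  # -> <ref name='value'>cite</ref>
    # Values containing spaces may no longer be left unquoted:
    tag.add("data", "a b c", quotes=None)  # raises ValueError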
--- CHANGELOG | 4 +++ docs/changelog.rst | 5 +++ mwparserfromhell/nodes/extras/attribute.py | 51 +++++++++++++++++++------- mwparserfromhell/nodes/tag.py | 17 +++++---- mwparserfromhell/parser/builder.py | 6 ++-- mwparserfromhell/parser/tokenizer.c | 21 +++++++---- mwparserfromhell/parser/tokenizer.h | 1 + mwparserfromhell/parser/tokenizer.py | 12 ++++--- mwparserfromhell/parser/tokens.py | 2 +- tests/_test_tree_equality.py | 2 +- tests/test_attribute.py | 48 +++++++++++++++++-------- tests/test_builder.py | 16 ++++----- tests/test_tag.py | 29 ++++++++------- tests/tokenizer/integration.mwtest | 2 +- tests/tokenizer/tags.mwtest | 58 ++++++++++++++++++++++-------- 15 files changed, 189 insertions(+), 85 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 1200575..f7dcb8a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -10,6 +10,10 @@ v0.4 (unreleased): option, RECURSE_OTHERS, which recurses over all children except instances of 'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)` returns all un-nested templates). +- The parser now understands HTML tag attributes quoted with single quotes. + When setting a tag attribute's value, quotes will be added if necessary. As + part of this, Attribute's 'quoted' attribute has been changed to 'quotes', + and is now either a string or None. - Calling Template.remove() with a Parameter object that is not part of the template now raises ValueError instead of doing nothing. - Parameters with non-integer keys can no longer be created with diff --git a/docs/changelog.rst b/docs/changelog.rst index ba26722..3bc4ce7 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -18,6 +18,11 @@ Unreleased which recurses over all children except instances of *forcetype* (for example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all un-nested templates). +- The parser now understands HTML tag attributes quoted with single quotes. + When setting a tag attribute's value, quotes will be added if necessary. As + part of this, :py:class:`.Attribute`\ 's :py:attr:`~.Attribute.quoted` + attribute has been changed to :py:attr:`~.Attribute.quotes`, and is now + either a string or ``None``. - Calling :py:meth:`.Template.remove` with a :py:class:`.Parameter` object that is not part of the template now raises :py:exc:`ValueError` instead of doing nothing. diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index 4b7c668..6256138 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -36,12 +36,14 @@ class Attribute(StringMixIn): whose value is ``"foo"``. 
""" - def __init__(self, name, value=None, quoted=True, pad_first=" ", + def __init__(self, name, value=None, quotes='"', pad_first=" ", pad_before_eq="", pad_after_eq=""): super(Attribute, self).__init__() + if not quotes and self._value_needs_quotes(value): + raise ValueError("given value {0!r} requires quotes".format(value)) self._name = name self._value = value - self._quoted = quoted + self._quotes = quotes self._pad_first = pad_first self._pad_before_eq = pad_before_eq self._pad_after_eq = pad_after_eq @@ -50,11 +52,18 @@ class Attribute(StringMixIn): result = self.pad_first + str(self.name) + self.pad_before_eq if self.value is not None: result += "=" + self.pad_after_eq - if self.quoted: - return result + '"' + str(self.value) + '"' + if self.quotes: + return result + self.quotes + str(self.value) + self.quotes return result + str(self.value) return result + @staticmethod + def _value_needs_quotes(val): + """Return the preferred quotes for the given value, or None.""" + if val and any(char.isspace() for char in val): + return ('"' in val and "'" in val) or ("'" if '"' in val else '"') + return None + def _set_padding(self, attr, value): """Setter for the value of a padding attribute.""" if not value: @@ -65,6 +74,14 @@ class Attribute(StringMixIn): raise ValueError("padding must be entirely whitespace") setattr(self, attr, value) + @staticmethod + def coerce_quotes(quotes): + """Coerce a quote type into an acceptable value, or raise an error.""" + orig, quotes = quotes, str(quotes) if quotes else None + if quotes not in [None, '"', "'"]: + raise ValueError("{0!r} is not a valid quote type".format(orig)) + return quotes + @property def name(self): """The name of the attribute as a :py:class:`~.Wikicode` object.""" @@ -76,9 +93,9 @@ class Attribute(StringMixIn): return self._value @property - def quoted(self): - """Whether the attribute's value is quoted with double quotes.""" - return self._quoted + def quotes(self): + """How to enclose the attribute value. ``"``, ``'``, or ``None``.""" + return self._quotes @property def pad_first(self): @@ -101,11 +118,21 @@ class Attribute(StringMixIn): @value.setter def value(self, newval): - self._value = None if newval is None else parse_anything(newval) - - @quoted.setter - def quoted(self, value): - self._quoted = bool(value) + if newval is None: + self._value = None + else: + code = parse_anything(newval) + quotes = self._value_needs_quotes(code) + if quotes in ['"', "'"] or (quotes is True and not self.quotes): + self._quotes = quotes + self._value = code + + @quotes.setter + def quotes(self, value): + value = self.coerce_quotes(value) + if not value and self._value_needs_quotes(self.value): + raise ValueError("attribute value requires quotes") + self._quotes = value @pad_first.setter def pad_first(self, value): diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index f283d46..1b8efb8 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -236,21 +236,24 @@ class Tag(Node): return attr raise ValueError(name) - def add(self, name, value=None, quoted=True, pad_first=" ", + def add(self, name, value=None, quotes='"', pad_first=" ", pad_before_eq="", pad_after_eq=""): """Add an attribute with the given *name* and *value*. *name* and *value* can be anything parsable by :py:func:`.utils.parse_anything`; *value* can be omitted if the - attribute is valueless. *quoted* is a bool telling whether to wrap the - *value* in double quotes (this is recommended). 
*pad_first*, - *pad_before_eq*, and *pad_after_eq* are whitespace used as padding - before the name, before the equal sign (or after the name if no value), - and after the equal sign (ignored if no value), respectively. + attribute is valueless. If *quotes* is not ``None``, it should be a + string (either ``"`` or ``'``) that *value* will be wrapped in (this is + recommended). ``None`` is only legal if *value* contains no spacing. + + *pad_first*, *pad_before_eq*, and *pad_after_eq* are whitespace used as + padding before the name, before the equal sign (or after the name if no + value), and after the equal sign (ignored if no value), respectively. """ if value is not None: value = parse_anything(value) - attr = Attribute(parse_anything(name), value, quoted) + quotes = Attribute.coerce_quotes(quotes) + attr = Attribute(parse_anything(name), value, quotes) attr.pad_first = pad_first attr.pad_before_eq = pad_before_eq attr.pad_after_eq = pad_after_eq diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 559bd54..c9a930b 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -193,7 +193,7 @@ class Builder(object): def _handle_attribute(self, start): """Handle a case where a tag attribute is at the head of the tokens.""" - name, quoted = None, False + name = quotes = None self._push() while self._tokens: token = self._tokens.pop() @@ -201,7 +201,7 @@ class Builder(object): name = self._pop() self._push() elif isinstance(token, tokens.TagAttrQuote): - quoted = True + quotes = token.char elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen, tokens.TagCloseSelfclose)): self._tokens.append(token) @@ -209,7 +209,7 @@ class Builder(object): value = self._pop() else: name, value = self._pop(), None - return Attribute(name, value, quoted, start.pad_first, + return Attribute(name, value, quotes, start.pad_first, start.pad_before_eq, start.pad_after_eq) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 963e7d7..4c6414e 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -173,7 +173,7 @@ static TagData* TagData_new(void) ALLOC_BUFFER(self->pad_first) ALLOC_BUFFER(self->pad_before_eq) ALLOC_BUFFER(self->pad_after_eq) - self->reset = 0; + self->quoter = self->reset = 0; return self; } @@ -1566,10 +1566,18 @@ static int Tokenizer_parse_comment(Tokenizer* self) */ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data) { - PyObject *tokens, *kwargs, *pad_first, *pad_before_eq, *pad_after_eq; + PyObject *tokens, *kwargs, *tmp, *pad_first, *pad_before_eq, *pad_after_eq; if (data->context & TAG_QUOTED) { - if (Tokenizer_emit_first(self, TagAttrQuote)) + kwargs = PyDict_New(); + if (!kwargs) + return -1; + tmp = PyUnicode_FromUnicode(&data->quoter, 1); + if (!tmp) + return -1; + PyDict_SetItemString(kwargs, "char", tmp); + Py_DECREF(tmp); + if (Tokenizer_emit_first_kwargs(self, TagAttrQuote, kwargs)) return -1; tokens = Tokenizer_pop(self); if (!tokens) @@ -1721,16 +1729,17 @@ Tokenizer_handle_tag_data(Tokenizer* self, TagData* data, Py_UNICODE chunk) Tokenizer_READ_BACKWARDS(self, 2) != '\\'); if (data->context & TAG_NOTE_QUOTE) { data->context ^= TAG_NOTE_QUOTE; - if (chunk == '"' && !escaped) { + if ((chunk == '"' || chunk == '\'') && !escaped) { data->context |= TAG_QUOTED; + data->quoter = chunk; + data->reset = self->head; if (Tokenizer_push(self, self->topstack->context)) return 
-1; - data->reset = self->head; return 0; } } else if (data->context & TAG_QUOTED) { - if (chunk == '"' && !escaped) { + if (chunk == data->quoter && !escaped) { data->context |= TAG_NOTE_SPACE; return 0; } diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 4312e2f..dde6464 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -206,6 +206,7 @@ typedef struct { struct Textbuffer* pad_first; struct Textbuffer* pad_before_eq; struct Textbuffer* pad_after_eq; + Py_UNICODE quoter; Py_ssize_t reset; } TagData; diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 6430f0f..4422b5c 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -53,6 +53,7 @@ class _TagOpenData(object): def __init__(self): self.context = self.CX_NAME self.padding_buffer = {"first": "", "before_eq": "", "after_eq": ""} + self.quoter = None self.reset = 0 @@ -66,7 +67,7 @@ class Tokenizer(object): MAX_DEPTH = 40 MAX_CYCLES = 100000 regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) - tag_splitter = re.compile(r"([\s\"\\]+)") + tag_splitter = re.compile(r"([\s\"\'\\]+)") def __init__(self): self._text = None @@ -612,7 +613,7 @@ class Tokenizer(object): def _push_tag_buffer(self, data): """Write a pending tag attribute from *data* to the stack.""" if data.context & data.CX_QUOTED: - self._emit_first(tokens.TagAttrQuote()) + self._emit_first(tokens.TagAttrQuote(char=data.quoter)) self._emit_all(self._pop()) buf = data.padding_buffer self._emit_first(tokens.TagAttrStart(pad_first=buf["first"], @@ -689,13 +690,14 @@ class Tokenizer(object): escaped = self._read(-1) == "\\" and self._read(-2) != "\\" if data.context & data.CX_NOTE_QUOTE: data.context ^= data.CX_NOTE_QUOTE - if chunk == '"' and not escaped: + if chunk in "'\"" and not escaped: data.context |= data.CX_QUOTED - self._push(self._context) + data.quoter = chunk data.reset = self._head + self._push(self._context) continue elif data.context & data.CX_QUOTED: - if chunk == '"' and not escaped: + if chunk == data.quoter and not escaped: data.context |= data.CX_NOTE_SPACE continue self._handle_tag_text(chunk) diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index c7cc3ef..e567731 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -100,7 +100,7 @@ CommentEnd = make("CommentEnd") # --> TagOpenOpen = make("TagOpenOpen") # < TagAttrStart = make("TagAttrStart") TagAttrEquals = make("TagAttrEquals") # = -TagAttrQuote = make("TagAttrQuote") # " +TagAttrQuote = make("TagAttrQuote") # ", ' TagCloseOpen = make("TagCloseOpen") # > TagCloseSelfclose = make("TagCloseSelfclose") # /> TagOpenClose = make("TagOpenClose") # [[Source]] + # mno = '{{p}} [[q]] {{r}}'>[[Source]] ([tokens.TagOpenOpen(), tokens.Text(text="ref"), tokens.TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), @@ -308,7 +308,7 @@ class TestBuilder(TreeEqualityTestCase): tokens.TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), tokens.Text(text="foo"), tokens.TagAttrEquals(), - tokens.TagAttrQuote(), tokens.Text(text="bar "), + tokens.TagAttrQuote(char='"'), tokens.Text(text="bar "), tokens.TemplateOpen(), tokens.Text(text="baz"), tokens.TemplateClose(), tokens.TagAttrStart(pad_first=" ", pad_before_eq="", @@ -326,7 +326,7 @@ class TestBuilder(TreeEqualityTestCase): tokens.TagAttrStart(pad_first=" \n ", pad_before_eq=" ", pad_after_eq=" "), 
tokens.Text(text="mno"), tokens.TagAttrEquals(), - tokens.TagAttrQuote(), tokens.TemplateOpen(), + tokens.TagAttrQuote(char="'"), tokens.TemplateOpen(), tokens.Text(text="p"), tokens.TemplateClose(), tokens.Text(text=" "), tokens.WikilinkOpen(), tokens.Text(text="q"), tokens.WikilinkClose(), @@ -338,17 +338,17 @@ class TestBuilder(TreeEqualityTestCase): tokens.TagCloseClose()], wrap([Tag(wraptext("ref"), wrap([Wikilink(wraptext("Source"))]), [ Attribute(wraptext("name"), - wrap([Template(wraptext("abc"))]), False), + wrap([Template(wraptext("abc"))]), None), Attribute(wraptext("foo"), wrap([Text("bar "), Template(wraptext("baz"))]), pad_first=" "), Attribute(wraptext("abc"), wrap([Template(wraptext("de")), - Text("f")]), False), + Text("f")]), None), Attribute(wraptext("ghi"), wrap([Text("j"), Template(wraptext("k")), - Template(wraptext("l"))]), False), + Template(wraptext("l"))]), None), Attribute(wraptext("mno"), wrap([Template(wraptext("p")), Text(" "), Wikilink(wraptext("q")), Text(" "), - Template(wraptext("r"))]), True, " \n ", " ", + Template(wraptext("r"))]), "'", " \n ", " ", " ")])])), # "''italic text''" diff --git a/tests/test_tag.py b/tests/test_tag.py index 0eae713..7577cce 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -34,9 +34,9 @@ from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext agen = lambda name, value: Attribute(wraptext(name), wraptext(value)) agennv = lambda name: Attribute(wraptext(name)) -agennq = lambda name, value: Attribute(wraptext(name), wraptext(value), False) -agenp = lambda name, v, a, b, c: Attribute(wraptext(name), v, True, a, b, c) -agenpnv = lambda name, a, b, c: Attribute(wraptext(name), None, True, a, b, c) +agennq = lambda name, value: Attribute(wraptext(name), wraptext(value), None) +agenp = lambda name, v, a, b, c: Attribute(wraptext(name), v, '"', a, b, c) +agenpnv = lambda name, a, b, c: Attribute(wraptext(name), None, '"', a, b, c) class TestTag(TreeEqualityTestCase): """Test cases for the Tag node.""" @@ -276,28 +276,33 @@ class TestTag(TreeEqualityTestCase): """test Tag.add()""" node = Tag(wraptext("ref"), wraptext("cite")) node.add("name", "value") - node.add("name", "value", quoted=False) + node.add("name", "value", quotes=None) + node.add("name", "value", quotes="'") node.add("name") node.add(1, False) node.add("style", "{{foobar}}") - node.add("name", "value", True, "\n", " ", " ") + node.add("name", "value", '"', "\n", " ", " ") attr1 = ' name="value"' attr2 = " name=value" - attr3 = " name" - attr4 = ' 1="False"' - attr5 = ' style="{{foobar}}"' - attr6 = '\nname = "value"' + attr3 = " name='value'" + attr4 = " name" + attr5 = ' 1="False"' + attr6 = ' style="{{foobar}}"' + attr7 = '\nname = "value"' self.assertEqual(attr1, node.attributes[0]) self.assertEqual(attr2, node.attributes[1]) self.assertEqual(attr3, node.attributes[2]) self.assertEqual(attr4, node.attributes[3]) self.assertEqual(attr5, node.attributes[4]) self.assertEqual(attr6, node.attributes[5]) - self.assertEqual(attr6, node.get("name")) + self.assertEqual(attr7, node.attributes[6]) + self.assertEqual(attr7, node.get("name")) self.assertWikicodeEqual(wrap([Template(wraptext("foobar"))]), - node.attributes[4].value) + node.attributes[5].value) self.assertEqual("".join(("cite")), node) + attr6, attr7, ">cite")), node) + self.assertRaises(ValueError, node.add, "name", "foo", quotes="bar") + self.assertRaises(ValueError, node.add, "name", "a bc d", quotes=None) def test_remove(self): """test Tag.remove()""" diff --git 
a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index 5e1a409..372a367 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -43,7 +43,7 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t name: rich_tags label: a HTML tag with tons of other things in it input: "{{dubious claim}}[[Source]]" -output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" \n ", pad_before_eq=" ", pad_after_eq=" "), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" \n ", pad_before_eq=" ", pad_after_eq=" "), Text(text="mno"), TagAttrEquals(), TagAttrQuote(char="\""), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 26e569b..f979329 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -57,7 +57,14 @@ output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before name: attribute_quoted label: a tag with a single quoted attribute input: "" -output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(), Text(text="ref"), 
TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
+
+---
+
+name: attribute_single_quoted
+label: a tag with a single singly-quoted attribute
+input: "<ref name='foo bar'></ref>"
+output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="'"), Text(text="foo bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

@@ -71,7 +78,7 @@ output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before
name: attribute_quoted_hyphen
label: a tag with a single quoted attribute, containing a hyphen
input: "<ref name="foo-bar"></ref>"
-output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
+output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

@@ -92,21 +99,21 @@ output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before
name: attribute_selfclosing_value_quoted
label: a self-closing tag with a single quoted attribute
input: "<ref name="foo"/>"
-output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")]
+output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(padding="")]

---

name: nested_tag
label: a tag nested within the attributes of another
input: "<ref name=<span style="color: red;">foo</span>>citation</ref>"
-output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]
+output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: nested_tag_quoted
label: a tag nested within the attributes of another, quoted
input: "<ref name="<span style="color: red;">foo</span>">citation</ref>"
-output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), 
TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]
+output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

@@ -120,7 +127,7 @@ output: [Text(text="<ref name=<ref></ref>>citation</ref>")]
name: nested_troll_tag_quoted
label: a bogus tag that appears to be nested within the attributes of another, quoted
input: "<ref name="<ref></ref>">citation</ref>"
-output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="<ref></ref>"), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]
+output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="<ref></ref>"), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

@@ -222,6 +229,27 @@ output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_befor

---

+name: quotes_in_quotes
+label: singly-quoted text inside a doubly-quoted attribute
+input: "<span foo="bar 'baz buzz' biz">stuff</span>"
+output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="bar 'baz buzz' biz"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]
+
+---
+
+name: quotes_in_quotes_2
+label: doubly-quoted text inside a singly-quoted attribute
+input: "<span foo='bar "baz buzz" biz'>stuff</span>"
+output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="'"), Text(text="bar \"baz buzz\" biz"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]
+
+---
+
+name: quotes_in_quotes_3
+label: doubly-quoted text inside a singly-quoted attribute, with backslashes
+input: "<span foo='bar "baz buzz\\" biz'>stuff</span>"
+output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="'"), Text(text="bar \"baz buzz\\\" biz"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]
+
+---
+
name: incomplete_lbracket
label: incomplete tags: just a left bracket
input: "<"
@@ -407,28 +435,28 @@ output: [Text(text="junk <>")]
name: backslash_premature_before
label: a backslash before a quote before a space
input: "<foo attribute="this is\\" quoted">blah</foo>"
-output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\" quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]
+output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), 
Text(text="this is\\\" quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] --- name: backslash_premature_after label: a backslash before a quote after a space input: "blah" -output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is \\\"quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] +output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="this is \\\"quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] --- name: backslash_premature_middle label: a backslash before a quote in the middle of a word input: "blah" -output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this i\\\"s quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] +output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="this i\\\"s quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] --- name: backslash_adjacent label: escaped quotes next to unescaped quotes input: "blah" -output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="\\\"this is quoted\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] +output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="\\\"this is quoted\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] --- @@ -442,21 +470,21 @@ output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before name: backslash_double label: two adjacent backslashes, which do *not* affect the quote input: "blah" -output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] +output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="this is\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] --- name: backslash_triple label: three adjacent backslashes, which do *not* affect the quote input: "blah" -output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), 
Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] +output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="this is\\\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] --- name: backslash_unaffecting label: backslashes near quotes, but not immediately adjacent, thus having no effect input: "blah" -output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="\\quote\\d"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="also"), TagAttrEquals(), Text(text="\"quote\\d\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] +output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="\\quote\\d"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="also"), TagAttrEquals(), Text(text="\"quote\\d\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] --- @@ -477,7 +505,7 @@ output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(t name: unparsable_attributed label: a tag that should not be put through the normal parser; parsed attributes input: "{{t1}}{{t2}}{{t3}}" -output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="nowiki"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr"), TagAttrEquals(), Text(text="val"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr2"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="val2"), TemplateClose(), TagCloseOpen(padding=""), Text(text="{{t2}}"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), TemplateOpen(), Text(text="t3"), TemplateClose()] +output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="nowiki"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr"), TagAttrEquals(), Text(text="val"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr2"), TagAttrEquals(), TagAttrQuote(char="\""), TemplateOpen(), Text(text="val2"), TemplateClose(), TagCloseOpen(padding=""), Text(text="{{t2}}"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), TemplateOpen(), Text(text="t3"), TemplateClose()] --- @@ -575,7 +603,7 @@ output: [Text(text="foo"), TagOpenOpen(invalid=True), Text(text="br"), TagCloseS name: single_only_close_attribute label: a tag that can only be single; presented as a close tag with an attribute input: "
    " -output: [TagOpenOpen(invalid=True), Text(text="br"), TagAttrStart(pad_first=" ", pad_after_eq="", pad_before_eq=""), Text(text="id"), TagAttrEquals(), TagAttrQuote(), Text(text="break"), TagCloseSelfclose(padding="", implicit=True)] +output: [TagOpenOpen(invalid=True), Text(text="br"), TagAttrStart(pad_first=" ", pad_after_eq="", pad_before_eq=""), Text(text="id"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="break"), TagCloseSelfclose(padding="", implicit=True)] --- From 963cb2f780bb3b2918810b5fb260e347ae6cd1ff Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 5 Jul 2014 15:53:57 -0400 Subject: [PATCH 031/102] Store builder handlers in a dictionary. ~5% speedup. --- mwparserfromhell/parser/builder.py | 68 ++++++++++++++++++++++++-------------- tests/test_builder.py | 13 +++++++- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index c9a930b..e0109e6 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -32,6 +32,19 @@ from ..wikicode import Wikicode __all__ = ["Builder"] +_HANDLERS = { + tokens.Text: lambda self, token: Text(token.text) +} + +def _add_handler(token_type): + """Create a decorator that adds a handler function to the lookup table.""" + def decorator(func): + """Add a handler function to the lookup table.""" + _HANDLERS[token_type] = func + return func + return decorator + + class Builder(object): """Builds a tree of nodes out of a sequence of tokens. @@ -83,8 +96,10 @@ class Builder(object): return Parameter(key, value, showkey) else: self._write(self._handle_token(token)) + raise ParserError("_handle_parameter() missed a close token") - def _handle_template(self): + @_add_handler(tokens.TemplateOpen) + def _handle_template(self, token): """Handle a case where a template is at the head of the tokens.""" params = [] default = 1 @@ -104,8 +119,10 @@ class Builder(object): return Template(name, params) else: self._write(self._handle_token(token)) + raise ParserError("_handle_template() missed a close token") - def _handle_argument(self): + @_add_handler(tokens.ArgumentOpen) + def _handle_argument(self, token): """Handle a case where an argument is at the head of the tokens.""" name = None self._push() @@ -120,8 +137,10 @@ class Builder(object): return Argument(self._pop()) else: self._write(self._handle_token(token)) + raise ParserError("_handle_argument() missed a close token") - def _handle_wikilink(self): + @_add_handler(tokens.WikilinkOpen) + def _handle_wikilink(self, token): """Handle a case where a wikilink is at the head of the tokens.""" title = None self._push() @@ -136,7 +155,9 @@ class Builder(object): return Wikilink(self._pop()) else: self._write(self._handle_token(token)) + raise ParserError("_handle_wikilink() missed a close token") + @_add_handler(tokens.ExternalLinkOpen) def _handle_external_link(self, token): """Handle when an external link is at the head of the tokens.""" brackets, url = token.brackets, None @@ -152,8 +173,10 @@ class Builder(object): return ExternalLink(self._pop(), brackets=brackets) else: self._write(self._handle_token(token)) + raise ParserError("_handle_external_link() missed a close token") - def _handle_entity(self): + @_add_handler(tokens.HTMLEntityStart) + def _handle_entity(self, token): """Handle a case where an HTML entity is at the head of the tokens.""" token = self._tokens.pop() if isinstance(token, tokens.HTMLEntityNumeric): @@ -168,6 +191,7 @@ class Builder(object): 
self._tokens.pop() # Remove HTMLEntityEnd return HTMLEntity(token.text, named=True, hexadecimal=False) + @_add_handler(tokens.HeadingStart) def _handle_heading(self, token): """Handle a case where a heading is at the head of the tokens.""" level = token.level @@ -179,8 +203,10 @@ class Builder(object): return Heading(title, level) else: self._write(self._handle_token(token)) + raise ParserError("_handle_heading() missed a close token") - def _handle_comment(self): + @_add_handler(tokens.CommentStart) + def _handle_comment(self, token): """Handle a case where an HTML comment is at the head of the tokens.""" self._push() while self._tokens: @@ -190,6 +216,7 @@ class Builder(object): return Comment(contents) else: self._write(self._handle_token(token)) + raise ParserError("_handle_comment() missed a close token") def _handle_attribute(self, start): """Handle a case where a tag attribute is at the head of the tokens.""" @@ -213,7 +240,9 @@ class Builder(object): start.pad_before_eq, start.pad_after_eq) else: self._write(self._handle_token(token)) + raise ParserError("_handle_attribute() missed a close token") + @_add_handler(tokens.TagOpenOpen) def _handle_tag(self, token): """Handle a case where a tag is at the head of the tokens.""" close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose) @@ -244,29 +273,15 @@ class Builder(object): invalid, implicit, padding, closing_tag) else: self._write(self._handle_token(token)) + raise ParserError("_handle_tag() missed a close token") def _handle_token(self, token): """Handle a single token.""" - if isinstance(token, tokens.Text): - return Text(token.text) - elif isinstance(token, tokens.TemplateOpen): - return self._handle_template() - elif isinstance(token, tokens.ArgumentOpen): - return self._handle_argument() - elif isinstance(token, tokens.WikilinkOpen): - return self._handle_wikilink() - elif isinstance(token, tokens.ExternalLinkOpen): - return self._handle_external_link(token) - elif isinstance(token, tokens.HTMLEntityStart): - return self._handle_entity() - elif isinstance(token, tokens.HeadingStart): - return self._handle_heading(token) - elif isinstance(token, tokens.CommentStart): - return self._handle_comment() - elif isinstance(token, tokens.TagOpenOpen): - return self._handle_tag(token) - err = "_handle_token() got unexpected {0}".format(type(token).__name__) - raise ParserError(err) + try: + return _HANDLERS[type(token)](self, token) + except KeyError: + err = "_handle_token() got unexpected {0}" + raise ParserError(err.format(type(token).__name__)) def build(self, tokenlist): """Build a Wikicode object from a list tokens and return it.""" @@ -277,3 +292,6 @@ class Builder(object): node = self._handle_token(self._tokens.pop()) self._write(node) return self._pop() + + +del _add_handler diff --git a/tests/test_builder.py b/tests/test_builder.py index 8f71ede..d4e6f73 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -421,11 +421,22 @@ class TestBuilder(TreeEqualityTestCase): named=True)]))])]) self.assertWikicodeEqual(valid, self.builder.build(test)) - def test_parser_error(self): + def test_parser_errors(self): """test whether ParserError gets thrown for bad input""" + missing_closes = [ + [tokens.TemplateOpen(), tokens.TemplateParamSeparator()], + [tokens.TemplateOpen()], [tokens.ArgumentOpen()], + [tokens.WikilinkOpen()], [tokens.ExternalLinkOpen()], + [tokens.HeadingStart()], [tokens.CommentStart()], + [tokens.TagOpenOpen(), tokens.TagAttrStart()], + [tokens.TagOpenOpen()] + ] + func = self.assertRaisesRegex if 
py3k else self.assertRaisesRegexp msg = r"_handle_token\(\) got unexpected TemplateClose" func(ParserError, msg, self.builder.build, [tokens.TemplateClose()]) + for test in missing_closes: + self.assertRaises(ParserError, self.builder.build, test) if __name__ == "__main__": unittest.main(verbosity=2) From ded89fb14ef4ebcf5c493e61f794ad8ac0288ec5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 5 Jul 2014 19:27:26 -0400 Subject: [PATCH 032/102] Add a few unit tests for untested code; remove a useless conditional. --- mwparserfromhell/parser/tokenizer.c | 5 +---- mwparserfromhell/parser/tokenizer.py | 5 +---- tests/tokenizer/comments.mwtest | 7 +++++++ tests/tokenizer/external_links.mwtest | 4 ++-- tests/tokenizer/integration.mwtest | 7 +++++++ tests/tokenizer/tags_wikimarkup.mwtest | 7 +++++++ 6 files changed, 25 insertions(+), 10 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 4c6414e..d62b965 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2508,10 +2508,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) } else if (context & LC_FAIL_ON_RBRACE) { if (data == '}') { - if (context & LC_TEMPLATE) - self->topstack->context |= LC_FAIL_ON_EQUALS; - else - self->topstack->context |= LC_FAIL_NEXT; + self->topstack->context |= LC_FAIL_NEXT; return 0; } self->topstack->context ^= LC_FAIL_ON_RBRACE; diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 4422b5c..09eb799 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1040,10 +1040,7 @@ class Tokenizer(object): self._context ^= contexts.FAIL_ON_LBRACE elif context & contexts.FAIL_ON_RBRACE: if this == "}": - if context & contexts.TEMPLATE: - self._context |= contexts.FAIL_ON_EQUALS - else: - self._context |= contexts.FAIL_NEXT + self._context |= contexts.FAIL_NEXT return True self._context ^= contexts.FAIL_ON_RBRACE elif this == "{": diff --git a/tests/tokenizer/comments.mwtest b/tests/tokenizer/comments.mwtest index ea2e89f..4bf82a9 100644 --- a/tests/tokenizer/comments.mwtest +++ b/tests/tokenizer/comments.mwtest @@ -37,3 +37,10 @@ name: incomplete_partial_close label: a comment that doesn't close, with a partial close input: "bingo" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo."), TemplateOpen(), Text(text="bar"), TemplateClose(), Text(text="baz.&biz;"), CommentStart(), Text(text="hello"), CommentEnd(), Text(text="bingo"), ExternalLinkClose()] diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index feff9c5..04f617a 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -244,6 +244,13 @@ output: [Text(text="''"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagClos --- +name: unending_bold_and_italics +label: five ticks (bold and italics) that don't end +input: "'''''testing" +output: [Text(text="'''''testing")] + +--- + name: complex_ul label: ul with a lot in it input: "* this is a test of an [[Unordered list|ul]] with {{plenty|of|stuff}}" From 871d48c688bf8133f886e1d84de7de536e252ae6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 10 Jul 2014 19:20:48 -0400 Subject: [PATCH 033/102] Solve a couple more coverage issues; tighten. 
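
The main tightening is the trailing-punctuation scan for free external
links, which previously walked over negative indices and needed an extra
bounds check that was awkward to cover. It is now a plain for/else. A
minimal sketch of the new loop shape (the helper name is invented for
illustration; the real code is inline in the tokenizer):

    def split_trailing_punct(this, punct=",;.:!?"):
        """Split a link chunk into (kept text, stripped punctuation)."""
        if not this.endswith(tuple(punct)):
            return this, ""
        # Walk backwards from the end; stop at the first character that
        # is not trailing punctuation. If the loop exhausts without
        # breaking, the whole chunk is punctuation and i falls to 0.
        for i in range(len(this) - 1, 0, -1):
            if this[i - 1] not in punct:
                break
        else:
            i = 0
        return this[:i], this[i:]

    >>> split_trailing_punct("example.com/foo.,")
    ('example.com/foo', '.,')

The unreachable fall-through in _handle_single_tag_end() likewise
becomes an explicit ParserError marked "# pragma: no cover".
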
--- .coveragerc | 1 + mwparserfromhell/parser/tokenizer.py | 8 ++++++-- tests/tokenizer/templates.mwtest | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/.coveragerc b/.coveragerc index 909a0e2..48a64ce 100644 --- a/.coveragerc +++ b/.coveragerc @@ -6,3 +6,4 @@ partial_branches = pragma: no branch if py3k: if not py3k: + if py26: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 09eb799..d867234 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -369,9 +369,11 @@ class Tokenizer(object): if "(" in this and ")" in punct: punct = punct[:-1] # ')' is not longer valid punctuation if this.endswith(punct): - for i in reversed(range(-len(this), 0)): - if i == -len(this) or this[i - 1] not in punct: + for i in range(len(this) - 1, 0, -1): + if this[i - 1] not in punct: break + else: + i = 0 stripped = this[:i] if stripped and tail: self._emit_text(tail) @@ -762,6 +764,8 @@ class Tokenizer(object): depth -= 1 if depth == 0: break + else: # pragma: no cover (untestable/exceptional case) + raise ParserError("_handle_single_tag_end() missed a TagCloseOpen") padding = stack[index].padding stack[index] = tokens.TagCloseSelfclose(padding=padding, implicit=True) return self._pop() diff --git a/tests/tokenizer/templates.mwtest b/tests/tokenizer/templates.mwtest index 78d7883..ff8a308 100644 --- a/tests/tokenizer/templates.mwtest +++ b/tests/tokenizer/templates.mwtest @@ -376,6 +376,20 @@ output: [Text(text="{{\nfoo\n|\n{{\nb\nar\n|\nb\naz\n=\nb\niz\n}}\n=\nb\nuzz\n}} --- +name: newlines_spaces +label: newlines in the middle of a template name, followed by spaces +input: "{{foo\n }}" +output: [TemplateOpen(), Text(text="foo\n "), TemplateClose()] + +--- + +name: newlines_spaces_param +label: newlines in the middle of a template name, followed by spaces +input: "{{foo\n }}" +output: [TemplateOpen(), Text(text="foo\n "), TemplateClose()] + +--- + name: invalid_name_left_brace_middle label: invalid characters in template name: left brace in middle input: "{{foo{bar}}" From fc529bdb57282d04cf6a660671a06489dcfe23a6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 10 Jul 2014 19:48:12 -0400 Subject: [PATCH 034/102] Add unit tests for #59. 
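
Issue #59 covers template names that contain a newline followed by an
HTML comment: the tokenizer currently rejects them and falls back to
plain text. These integration tests pin down the expected behavior ahead
of the fix in the next commit; roughly, once it lands:

    >>> import mwparserfromhell
    >>> text = "{{foobar\n<!-- comment -->|key=value}}"
    >>> mwparserfromhell.parse(text).filter_templates()
    ['{{foobar\n<!-- comment -->|key=value}}']

When actual text (not just a comment and whitespace) follows the
newline, as in "{{foobar\n<!-- comment -->invalid|key=value}}", the
input must still be treated as plain text.
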
---
 tests/tokenizer/integration.mwtest | 35 +++++++++++++++++++++++++++++++++++
 tests/tokenizer/templates.mwtest | 6 +++---
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest
index c16fe53..ef6d5c5 100644
--- a/tests/tokenizer/integration.mwtest
+++ b/tests/tokenizer/integration.mwtest
@@ -192,3 +192,38 @@
name: nodes_inside_external_link_after_punct
label: various complex nodes inside an external link following punctuation
input: "http://example.com/foo.{{bar}}baz.&biz;<!--hello-->bingo"
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo."), TemplateOpen(), Text(text="bar"), TemplateClose(), Text(text="baz.&biz;"), CommentStart(), Text(text="hello"), CommentEnd(), Text(text="bingo"), ExternalLinkClose()]
+
+---
+
+name: newline_and_comment_in_template_name
+label: a template name containing a newline followed by a comment
+input: "{{foobar\n<!-- comment -->}}"
+output: [TemplateOpen(), Text(text="foobar\n"), CommentStart(), Text(text=" comment "), CommentEnd(), TemplateClose()]
+
+---
+
+name: newline_and_comment_in_template_name_2
+label: a template name containing a newline followed by a comment
+input: "{{foobar\n<!-- comment -->|key=value}}"
+output: [TemplateOpen(), Text(text="foobar\n"), CommentStart(), Text(text=" comment "), CommentEnd(), TemplateParamSeparator(), Text(text="key"), TemplateParamEquals(), Text(text="value"), TemplateClose()]
+
+---
+
+name: newline_and_comment_in_template_name_3
+label: a template name containing a newline followed by a comment
+input: "{{foobar\n<!-- comment -->\n|key=value}}"
+output: [TemplateOpen(), Text(text="foobar\n"), CommentStart(), Text(text=" comment "), CommentEnd(), Text(text="\n"), TemplateParamSeparator(), Text(text="key"), TemplateParamEquals(), Text(text="value"), TemplateClose()]
+
+---
+
+name: newline_and_comment_in_template_name_4
+label: a template name containing a newline followed by a comment
+input: "{{foobar\n<!-- comment -->invalid|key=value}}"
+output: [Text(text="{{foobar\n"), CommentStart(), Text(text=" comment "), CommentEnd(), Text(text="invalid|key=value}}")]
+
+---
+
+name: newline_and_comment_in_template_name_5
+label: a template name containing a newline followed by a comment
+input: "{{foobar\n<!-- comment -->\ninvalid|key=value}}"
+output: [Text(text="{{foobar\n"), CommentStart(), Text(text=" comment "), CommentEnd(), Text(text="\ninvalid|key=value}}")]
diff --git a/tests/tokenizer/templates.mwtest b/tests/tokenizer/templates.mwtest
index ff8a308..25e178a 100644
--- a/tests/tokenizer/templates.mwtest
+++ b/tests/tokenizer/templates.mwtest
@@ -384,9 +384,9 @@

---

name: newlines_spaces_param
-label: newlines in the middle of a template name, followed by spaces
-input: "{{foo\n }}"
-output: [TemplateOpen(), Text(text="foo\n "), TemplateClose()]
+label: newlines in the middle of a template name, followed by spaces, with a parameter
+input: "{{foo\n |bar=baz}}"
+output: [TemplateOpen(), Text(text="foo\n "), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose()]

---

From 6954480263b537c775c960f9b64e3a9cd4706481 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@gmail.com>
Date: Thu, 10 Jul 2014 20:17:45 -0400
Subject: [PATCH 035/102] Fix template parsing when comments are inside the name (fixes #59).
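
The fix is a small handshake between _verify_safe() and the comment
parser. While verifying a template name that already has text, further
text after a newline normally kills the route; but if that text starts
with "<!", it may be a legal comment, so the tokenizer now sets
FAIL_NEXT only tentatively and lets the comment parser clear it once a
full comment is consumed. In sketch form, from the Python tokenizer
below:

    # in _verify_safe(), inside the FAIL_ON_TEXT branch:
    if this == "<" and self._read(1) == "!":
        self._context |= contexts.FAIL_NEXT  # possibly a comment
        return True

    # in _parse_comment(), after "-->" closes the comment:
    if self._context & contexts.FAIL_NEXT:
        self._context ^= contexts.FAIL_NEXT  # it really was a comment

If the comment never closes, FAIL_NEXT stays set and the name is
rejected as before (newline_and_comment_in_template_name_6 covers this).
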
--- mwparserfromhell/parser/tokenizer.c | 17 ++++++++++++++--- mwparserfromhell/parser/tokenizer.py | 8 ++++++++ tests/tokenizer/integration.mwtest | 14 ++++++++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index d62b965..814ad50 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1553,6 +1553,12 @@ static int Tokenizer_parse_comment(Tokenizer* self) return -1; Py_DECREF(comment); self->head += 2; + if (self->topstack->context & LC_FAIL_NEXT) { + /* _verify_safe() sets this flag while parsing a template name + when it encounters what might be a comment -- we must unset + it to let _verify_safe() know it was correct: */ + self->topstack->context ^= LC_FAIL_NEXT; + } return 0; } if (Tokenizer_emit_char(self, this)) @@ -2478,8 +2484,13 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) return 0; if (context & LC_HAS_TEXT) { if (context & LC_FAIL_ON_TEXT) { - if (!Py_UNICODE_ISSPACE(data)) + if (!Py_UNICODE_ISSPACE(data)) { + if (data == '<' && Tokenizer_READ(self, 1) == '!') { + self->topstack->context |= LC_FAIL_NEXT; + return 0; + } return -1; + } } else { if (data == '\n') @@ -2496,8 +2507,8 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) } } else if (context & LC_FAIL_ON_LBRACE) { - if (data == '{' || (Tokenizer_READ(self, -1) == '{' && - Tokenizer_READ(self, -2) == '{')) { + if (data == '{' || (Tokenizer_READ_BACKWARDS(self, 1) == '{' && + Tokenizer_READ_BACKWARDS(self, 2) == '{')) { if (context & LC_TEMPLATE) self->topstack->context |= LC_FAIL_ON_EQUALS; else diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index d867234..44f0d60 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -608,6 +608,11 @@ class Tokenizer(object): self._emit(tokens.CommentEnd()) self._emit_all(self._pop()) self._head += 2 + if self._context & contexts.FAIL_NEXT: + # _verify_safe() sets this flag while parsing a template + # name when it encounters what might be a comment -- we + # must unset it to let _verify_safe() know it was correct: + self._context ^= contexts.FAIL_NEXT return self._emit_text(this) self._head += 1 @@ -1021,6 +1026,9 @@ class Tokenizer(object): if context & contexts.HAS_TEXT: if context & contexts.FAIL_ON_TEXT: if this is self.END or not this.isspace(): + if this == "<" and self._read(1) == "!": + self._context |= contexts.FAIL_NEXT + return True return False else: if this == "\n": diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index ef6d5c5..1019175 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -227,3 +227,17 @@ name: newline_and_comment_in_template_name_5 label: a template name containing a newline followed by a comment input: "{{foobar\n\ninvalid|key=value}}" output: [Text(text="{{foobar\n"), CommentStart(), Text(text=" comment "), CommentEnd(), Text(text="\ninvalid|key=value}}")] + +--- + +name: newline_and_comment_in_template_name_6 +label: a template name containing a newline followed by a comment +input: "{{foobar\n``) and - :py:class:`Wikilinks <.Wikilink>` (``[[foo]]``). -- Added corresponding :py:meth:`.ifilter_links` and :py:meth:`.filter_links` - methods to :py:class:`.Wikicode`. +- Added support for :class:`Comments <.Comment>` (````) and + :class:`Wikilinks <.Wikilink>` (``[[foo]]``). 
+- Added corresponding :meth:`.ifilter_links` and :meth:`.filter_links` methods + to :class:`.Wikicode`. - Fixed a bug when parsing incomplete templates. -- Fixed :py:meth:`.strip_code` to affect the contents of headings. +- Fixed :meth:`.strip_code` to affect the contents of headings. - Various copyedits in documentation and comments. v0.1 diff --git a/docs/index.rst b/docs/index.rst index a6d2df3..988f5e7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,9 +1,9 @@ MWParserFromHell v\ |version| Documentation =========================================== -:py:mod:`mwparserfromhell` (the *MediaWiki Parser from Hell*) is a Python -package that provides an easy-to-use and outrageously powerful parser for -MediaWiki_ wikicode. It supports Python 2 and Python 3. +:mod:`mwparserfromhell` (the *MediaWiki Parser from Hell*) is a Python package +that provides an easy-to-use and outrageously powerful parser for MediaWiki_ +wikicode. It supports Python 2 and Python 3. Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others. Development occurs on GitHub_. diff --git a/docs/integration.rst b/docs/integration.rst index a09334d..102b3b9 100644 --- a/docs/integration.rst +++ b/docs/integration.rst @@ -1,11 +1,11 @@ Integration =========== -:py:mod:`mwparserfromhell` is used by and originally developed for EarwigBot_; -:py:class:`~earwigbot.wiki.page.Page` objects have a -:py:meth:`~earwigbot.wiki.page.Page.parse` method that essentially calls -:py:func:`mwparserfromhell.parse() ` on -:py:meth:`~earwigbot.wiki.page.Page.get`. +:mod:`mwparserfromhell` is used by and originally developed for EarwigBot_; +:class:`~earwigbot.wiki.page.Page` objects have a +:meth:`~earwigbot.wiki.page.Page.parse` method that essentially calls +:func:`mwparserfromhell.parse() ` on +:meth:`~earwigbot.wiki.page.Page.get`. If you're using Pywikipedia_, your code might look like this:: diff --git a/docs/usage.rst b/docs/usage.rst index 974c670..c471397 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -6,9 +6,9 @@ Normal usage is rather straightforward (where ``text`` is page text):: >>> import mwparserfromhell >>> wikicode = mwparserfromhell.parse(text) -``wikicode`` is a :py:class:`mwparserfromhell.Wikicode <.Wikicode>` object, -which acts like an ordinary ``unicode`` object (or ``str`` in Python 3) with -some extra methods. For example:: +``wikicode`` is a :class:`mwparserfromhell.Wikicode <.Wikicode>` object, which +acts like an ordinary ``unicode`` object (or ``str`` in Python 3) with some +extra methods. For example:: >>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?" >>> wikicode = mwparserfromhell.parse(text) @@ -33,9 +33,9 @@ Since nodes can contain other nodes, getting nested templates is trivial:: >>> mwparserfromhell.parse(text).filter_templates() ['{{foo|{{bar}}={{baz|{{spam}}}}}}', '{{bar}}', '{{baz|{{spam}}}}', '{{spam}}'] -You can also pass *recursive=False* to :py:meth:`~.filter_templates` and -explore templates manually. This is possible because nodes can contain -additional :py:class:`~.Wikicode` objects:: +You can also pass *recursive=False* to :meth:`.filter_templates` and explore +templates manually. This is possible because nodes can contain additional +:class:`.Wikicode` objects:: >>> code = mwparserfromhell.parse("{{foo|this {{includes a|template}}}}") >>> print code.filter_templates(recursive=False) @@ -49,11 +49,11 @@ additional :py:class:`~.Wikicode` objects:: template Templates can be easily modified to add, remove, or alter params. 
-:py:class:`~.Wikicode` objects can be treated like lists, with -:py:meth:`~.Wikicode.append`, :py:meth:`~.Wikicode.insert`, -:py:meth:`~.Wikicode.remove`, :py:meth:`~.Wikicode.replace`, and more. They -also have a :py:meth:`~.Wikicode.matches` method for comparing page or template -names, which takes care of capitalization and whitespace:: +:class:`.Wikicode` objects can be treated like lists, with +:meth:`~.Wikicode.append`, :meth:`~.Wikicode.insert`, +:meth:`~.Wikicode.remove`, :meth:`~.Wikicode.replace`, and more. They also have +a :meth:`~.Wikicode.matches` method for comparing page or template names, which +takes care of capitalization and whitespace:: >>> text = "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}" >>> code = mwparserfromhell.parse(text) @@ -69,8 +69,8 @@ names, which takes care of capitalization and whitespace:: >>> print code.filter_templates() ['{{cleanup|date=July 2012}}', '{{bar-stub}}'] -You can then convert ``code`` back into a regular :py:class:`unicode` object -(for saving the page!) by calling :py:func:`unicode` on it:: +You can then convert ``code`` back into a regular :class:`unicode` object (for +saving the page!) by calling :func:`unicode` on it:: >>> text = unicode(code) >>> print text @@ -78,7 +78,7 @@ You can then convert ``code`` back into a regular :py:class:`unicode` object >>> text == code True -(Likewise, use :py:func:`str(code) ` in Python 3.) +(Likewise, use :func:`str(code) ` in Python 3.) -For more tips, check out :py:class:`Wikicode's full method list <.Wikicode>` -and the :py:mod:`list of Nodes <.nodes>`. +For more tips, check out :class:`Wikicode's full method list <.Wikicode>` and +the :mod:`list of Nodes <.nodes>`. From 87e0079512f3d85813541dc97a240713fc0b33c9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 11 Jul 2014 00:30:47 -0400 Subject: [PATCH 039/102] Take proper advantage of Sphinx's default domains. --- mwparserfromhell/compat.py | 8 +- mwparserfromhell/nodes/__init__.py | 26 ++-- mwparserfromhell/nodes/external_link.py | 4 +- mwparserfromhell/nodes/extras/__init__.py | 5 +- mwparserfromhell/nodes/extras/attribute.py | 6 +- mwparserfromhell/nodes/extras/parameter.py | 4 +- mwparserfromhell/nodes/heading.py | 2 +- mwparserfromhell/nodes/tag.py | 26 ++-- mwparserfromhell/nodes/template.py | 32 ++--- mwparserfromhell/nodes/wikilink.py | 4 +- mwparserfromhell/parser/__init__.py | 36 +++--- mwparserfromhell/parser/builder.py | 10 +- mwparserfromhell/parser/contexts.py | 88 +++++++------- mwparserfromhell/parser/tokenizer.py | 12 +- mwparserfromhell/parser/tokens.py | 4 +- mwparserfromhell/smart_list.py | 22 ++-- mwparserfromhell/string_mixin.py | 11 +- mwparserfromhell/utils.py | 20 ++-- mwparserfromhell/wikicode.py | 184 ++++++++++++++--------------- 19 files changed, 248 insertions(+), 256 deletions(-) diff --git a/mwparserfromhell/compat.py b/mwparserfromhell/compat.py index 4384ace..590a271 100644 --- a/mwparserfromhell/compat.py +++ b/mwparserfromhell/compat.py @@ -2,10 +2,10 @@ """ Implements support for both Python 2 and Python 3 by defining common types in -terms of their Python 2/3 variants. For example, :py:class:`str` is set to -:py:class:`unicode` on Python 2 but :py:class:`str` on Python 3; likewise, -:py:class:`bytes` is :py:class:`str` on 2 but :py:class:`bytes` on 3. These -types are meant to be imported directly from within the parser's modules. +terms of their Python 2/3 variants. 
For example, :class:`str` is set to +:class:`unicode` on Python 2 but :class:`str` on Python 3; likewise, +:class:`bytes` is :class:`str` on 2 but :class:`bytes` on 3. These types are +meant to be imported directly from within the parser's modules. """ import sys diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index d6f60bd..8e71c8b 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -21,12 +21,12 @@ # SOFTWARE. """ -This package contains :py:class:`~.Wikicode` "nodes", which represent a single -unit of wikitext, such as a Template, an HTML tag, a Heading, or plain text. -The node "tree" is far from flat, as most types can contain additional -:py:class:`~.Wikicode` types within them - and with that, more nodes. For -example, the name of a :py:class:`~.Template` is a :py:class:`~.Wikicode` -object that can contain text or more templates. +This package contains :class:`.Wikicode` "nodes", which represent a single unit +of wikitext, such as a Template, an HTML tag, a Heading, or plain text. The +node "tree" is far from flat, as most types can contain additional +:class:`.Wikicode` types within them - and with that, more nodes. For example, +the name of a :class:`.Template` is a :class:`.Wikicode` object that can +contain text or more templates. """ from __future__ import unicode_literals @@ -40,16 +40,16 @@ __all__ = ["Node", "Text", "Argument", "Heading", "HTMLEntity", "Tag", class Node(StringMixIn): """Represents the base Node type, demonstrating the methods to override. - :py:meth:`__unicode__` must be overridden. It should return a ``unicode`` - or (``str`` in py3k) representation of the node. If the node contains - :py:class:`~.Wikicode` objects inside of it, :py:meth:`__children__` - should be a generator that iterates over them. If the node is printable - (shown when the page is rendered), :py:meth:`__strip__` should return its + :meth:`__unicode__` must be overridden. It should return a ``unicode`` or + (``str`` in py3k) representation of the node. If the node contains + :class:`.Wikicode` objects inside of it, :meth:`__children__` should be a + generator that iterates over them. If the node is printable + (shown when the page is rendered), :meth:`__strip__` should return its printable version, stripping out any formatting marks. It does not have to return a string, but something that can be converted to a string with - ``str()``. Finally, :py:meth:`__showtree__` can be overridden to build a + ``str()``. Finally, :meth:`__showtree__` can be overridden to build a nice tree representation of the node, if desired, for - :py:meth:`~.Wikicode.get_tree`. + :meth:`~.Wikicode.get_tree`. 
""" def __unicode__(self): raise NotImplementedError() diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index d13376e..f98a1e5 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -67,12 +67,12 @@ class ExternalLink(Node): @property def url(self): - """The URL of the link target, as a :py:class:`~.Wikicode` object.""" + """The URL of the link target, as a :class:`.Wikicode` object.""" return self._url @property def title(self): - """The link title (if given), as a :py:class:`~.Wikicode` object.""" + """The link title (if given), as a :class:`.Wikicode` object.""" return self._title @property diff --git a/mwparserfromhell/nodes/extras/__init__.py b/mwparserfromhell/nodes/extras/__init__.py index a131269..7c0262b 100644 --- a/mwparserfromhell/nodes/extras/__init__.py +++ b/mwparserfromhell/nodes/extras/__init__.py @@ -21,9 +21,8 @@ # SOFTWARE. """ -This package contains objects used by -:py:class:`~.Node`\ s, but are not nodes themselves. This includes the -parameters of Templates or the attributes of HTML tags. +This package contains objects used by :class:`.Node`\ s, but that are not nodes +themselves. This includes template parameters and HTML tag attributes. """ from .attribute import Attribute diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index 6256138..cb50194 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -31,7 +31,7 @@ __all__ = ["Attribute"] class Attribute(StringMixIn): """Represents an attribute of an HTML tag. - This is used by :py:class:`~.Tag` objects. For example, the tag + This is used by :class:`.Tag` objects. For example, the tag ```` contains an Attribute whose name is ``"name"`` and whose value is ``"foo"``. 
""" @@ -84,12 +84,12 @@ class Attribute(StringMixIn): @property def name(self): - """The name of the attribute as a :py:class:`~.Wikicode` object.""" + """The name of the attribute as a :class:`.Wikicode` object.""" return self._name @property def value(self): - """The value of the attribute as a :py:class:`~.Wikicode` object.""" + """The value of the attribute as a :class:`.Wikicode` object.""" return self._value @property diff --git a/mwparserfromhell/nodes/extras/parameter.py b/mwparserfromhell/nodes/extras/parameter.py index 5a67ae0..50c9ac0 100644 --- a/mwparserfromhell/nodes/extras/parameter.py +++ b/mwparserfromhell/nodes/extras/parameter.py @@ -58,12 +58,12 @@ class Parameter(StringMixIn): @property def name(self): - """The name of the parameter as a :py:class:`~.Wikicode` object.""" + """The name of the parameter as a :class:`.Wikicode` object.""" return self._name @property def value(self): - """The value of the parameter as a :py:class:`~.Wikicode` object.""" + """The value of the parameter as a :class:`.Wikicode` object.""" return self._value @property diff --git a/mwparserfromhell/nodes/heading.py b/mwparserfromhell/nodes/heading.py index 47c23a8..696b5ee 100644 --- a/mwparserfromhell/nodes/heading.py +++ b/mwparserfromhell/nodes/heading.py @@ -52,7 +52,7 @@ class Heading(Node): @property def title(self): - """The title of the heading, as a :py:class:`~.Wikicode` object.""" + """The title of the heading, as a :class:`.Wikicode` object.""" return self._title @property diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 1b8efb8..7cbe78d 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -108,19 +108,19 @@ class Tag(Node): @property def tag(self): - """The tag itself, as a :py:class:`~.Wikicode` object.""" + """The tag itself, as a :class:`.Wikicode` object.""" return self._tag @property def contents(self): - """The contents of the tag, as a :py:class:`~.Wikicode` object.""" + """The contents of the tag, as a :class:`.Wikicode` object.""" return self._contents @property def attributes(self): """The list of attributes affecting the tag. - Each attribute is an instance of :py:class:`~.Attribute`. + Each attribute is an instance of :class:`.Attribute`. """ return self._attrs @@ -146,7 +146,7 @@ class Tag(Node): This makes the tag look like a lone close tag. It is technically invalid and is only parsable Wikicode when the tag itself is single-only, like ``
    `` and ````. See - :py:func:`.definitions.is_single_only`. + :func:`.definitions.is_single_only`. """ return self._invalid @@ -155,8 +155,8 @@ class Tag(Node): """Whether the tag is implicitly self-closing, with no ending slash. This is only possible for specific "single" tags like ``
    `` and - ``
  • ``. See :py:func:`.definitions.is_single`. This field only has an - effect if :py:attr:`self_closing` is also ``True``. + ``
  • ``. See :func:`.definitions.is_single`. This field only has an + effect if :attr:`self_closing` is also ``True``. """ return self._implicit @@ -167,9 +167,9 @@ class Tag(Node): @property def closing_tag(self): - """The closing tag, as a :py:class:`~.Wikicode` object. + """The closing tag, as a :class:`.Wikicode` object. - This will usually equal :py:attr:`tag`, unless there is additional + This will usually equal :attr:`tag`, unless there is additional spacing, comments, or the like. """ return self._closing_tag @@ -226,8 +226,8 @@ class Tag(Node): def get(self, name): """Get the attribute with the given *name*. - The returned object is a :py:class:`~.Attribute` instance. Raises - :py:exc:`ValueError` if no attribute has this name. Since multiple + The returned object is a :class:`.Attribute` instance. Raises + :exc:`ValueError` if no attribute has this name. Since multiple attributes can have the same name, we'll return the last match, since all but the last are ignored by the MediaWiki parser. """ @@ -241,9 +241,9 @@ class Tag(Node): """Add an attribute with the given *name* and *value*. *name* and *value* can be anything parsable by - :py:func:`.utils.parse_anything`; *value* can be omitted if the - attribute is valueless. If *quotes* is not ``None``, it should be a - string (either ``"`` or ``'``) that *value* will be wrapped in (this is + :func:`.utils.parse_anything`; *value* can be omitted if the attribute + is valueless. If *quotes* is not ``None``, it should be a string + (either ``"`` or ``'``) that *value* will be wrapped in (this is recommended). ``None`` is only legal if *value* contains no spacing. *pad_first*, *pad_before_eq*, and *pad_after_eq* are whitespace used as diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index c0fda5d..a9b14aa 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -110,8 +110,8 @@ class Template(Node): """Try to determine the whitespace conventions for parameters. This will examine the existing parameters and use - :py:meth:`_select_theory` to determine if there are any preferred - styles for how much whitespace to put before or after the value. + :meth:`_select_theory` to determine if there are any preferred styles + for how much whitespace to put before or after the value. """ before_theories = defaultdict(lambda: 0) after_theories = defaultdict(lambda: 0) @@ -159,7 +159,7 @@ class Template(Node): @property def name(self): - """The name of the template, as a :py:class:`~.Wikicode` object.""" + """The name of the template, as a :class:`.Wikicode` object.""" return self._name @property @@ -189,13 +189,13 @@ class Template(Node): has_param = lambda self, name, ignore_empty=False: \ self.has(name, ignore_empty) - has_param.__doc__ = "Alias for :py:meth:`has`." + has_param.__doc__ = "Alias for :meth:`has`." def get(self, name): """Get the parameter whose name is *name*. - The returned object is a :py:class:`~.Parameter` instance. Raises - :py:exc:`ValueError` if no parameter has this name. Since multiple + The returned object is a :class:`.Parameter` instance. Raises + :exc:`ValueError` if no parameter has this name. Since multiple parameters can have the same name, we'll return the last match, since the last parameter is the only one read by the MediaWiki parser. """ @@ -210,8 +210,8 @@ class Template(Node): """Add a parameter to the template with a given *name* and *value*. 
*name* and *value* can be anything parsable by - :py:func:`.utils.parse_anything`; pipes and equal signs are - automatically escaped from *value* when appropriate. + :func:`.utils.parse_anything`; pipes and equal signs are automatically + escaped from *value* when appropriate. If *showkey* is given, this will determine whether or not to show the parameter's name (e.g., ``{{foo|bar}}``'s parameter has a name of @@ -221,13 +221,13 @@ class Template(Node): If *name* is already a parameter in the template, we'll replace its value while keeping the same whitespace around it. We will also try to guess the dominant spacing convention when adding a new parameter using - :py:meth:`_get_spacing_conventions`. + :meth:`_get_spacing_conventions`. - If *before* is given (either a :py:class:`~.Parameter` object or a - name), then we will place the parameter immediately before this one. + If *before* is given (either a :class:`.Parameter` object or a name), + then we will place the parameter immediately before this one. Otherwise, it will be added at the end. If *before* is a name and exists multiple times in the template, we will place it before the last - occurrence. If *before* is not in the template, :py:exc:`ValueError` is + occurrence. If *before* is not in the template, :exc:`ValueError` is raised. The argument is ignored if the new parameter already exists. If *preserve_spacing* is ``False``, we will avoid preserving spacing @@ -289,9 +289,9 @@ class Template(Node): def remove(self, param, keep_field=False): """Remove a parameter from the template, identified by *param*. - If *param* is a :py:class:`.Parameter` object, it will be matched - exactly, otherwise it will be treated like the *name* argument to - :py:meth:`has` and :py:meth:`get`. + If *param* is a :class:`.Parameter` object, it will be matched exactly, + otherwise it will be treated like the *name* argument to :meth:`has` + and :meth:`get`. If *keep_field* is ``True``, we will keep the parameter's name, but blank its value. Otherwise, we will remove the parameter completely @@ -300,7 +300,7 @@ class Template(Node): we expected, so ``{{foo||baz}}`` will be produced instead). If the parameter shows up multiple times in the template and *param* is - not a :py:class:`.Parameter` object, we will remove all instances of it + not a :class:`.Parameter` object, we will remove all instances of it (and keep only one if *keep_field* is ``True`` - the first instance if none have dependents, otherwise the one with dependents will be kept). """ diff --git a/mwparserfromhell/nodes/wikilink.py b/mwparserfromhell/nodes/wikilink.py index 4640f34..f9c221c 100644 --- a/mwparserfromhell/nodes/wikilink.py +++ b/mwparserfromhell/nodes/wikilink.py @@ -62,12 +62,12 @@ class Wikilink(Node): @property def title(self): - """The title of the linked page, as a :py:class:`~.Wikicode` object.""" + """The title of the linked page, as a :class:`.Wikicode` object.""" return self._title @property def text(self): - """The text to display (if any), as a :py:class:`~.Wikicode` object.""" + """The text to display (if any), as a :class:`.Wikicode` object.""" return self._text @title.setter diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 467d5df..36cb511 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -22,8 +22,8 @@ """ This package contains the actual wikicode parser, split up into two main -modules: the :py:mod:`~.tokenizer` and the :py:mod:`~.builder`. 
This module -joins them together under one interface. +modules: the :mod:`.tokenizer` and the :mod:`.builder`. This module joins them +together into one interface. """ class ParserError(Exception): @@ -54,16 +54,16 @@ class Parser(object): """Represents a parser for wikicode. Actual parsing is a two-step process: first, the text is split up into a - series of tokens by the :py:class:`.Tokenizer`, and then the tokens are - converted into trees of :py:class:`.Wikicode` objects and - :py:class:`.Node`\ s by the :py:class:`.Builder`. + series of tokens by the :class:`.Tokenizer`, and then the tokens are + converted into trees of :class:`.Wikicode` objects and :class:`.Node`\ s by + the :class:`.Builder`. - Instances of this class or its dependents (:py:class:`.Tokenizer` and - :py:class:`.Builder`) should not be shared between threads. - :py:meth:`parse` can be called multiple times as long as it is not done - concurrently. In general, there is no need to do this because parsing - should be done through :py:func:`mwparserfromhell.parse`, which creates a - new :py:class:`.Parser` object as necessary. + Instances of this class or its dependents (:class:`.Tokenizer` and + :class:`.Builder`) should not be shared between threads. :meth:`parse` can + be called multiple times as long as it is not done concurrently. In + general, there is no need to do this because parsing should be done through + :func:`mwparserfromhell.parse`, which creates a new :class:`.Parser` object + as necessary. """ def __init__(self): @@ -74,20 +74,20 @@ class Parser(object): self._builder = Builder() def parse(self, text, context=0, skip_style_tags=False): - """Parse *text*, returning a :py:class:`~.Wikicode` object tree. + """Parse *text*, returning a :class:`.Wikicode` object tree. If given, *context* will be passed as a starting context to the parser. This is helpful when this function is used inside node attribute - setters. For example, :py:class:`~.ExternalLink`\ 's - :py:attr:`~.ExternalLink.url` setter sets *context* to - :py:mod:`contexts.EXT_LINK_URI <.contexts>` to prevent the URL itself - from becoming an :py:class:`~.ExternalLink`. + setters. For example, :class:`.ExternalLink`\ 's + :attr:`~.ExternalLink.url` setter sets *context* to + :mod:`contexts.EXT_LINK_URI <.contexts>` to prevent the URL itself + from becoming an :class:`.ExternalLink`. If *skip_style_tags* is ``True``, then ``''`` and ``'''`` will not be parsed, but instead will be treated as plain text. - If there is an internal error while parsing, :py:exc:`.ParserError` - will be raised. + If there is an internal error while parsing, :exc:`.ParserError` will + be raised. """ tokens = self._tokenizer.tokenize(text, context, skip_style_tags) code = self._builder.build(tokens) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index e0109e6..2d68036 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -48,9 +48,9 @@ def _add_handler(token_type): class Builder(object): """Builds a tree of nodes out of a sequence of tokens. - To use, pass a list of :py:class:`~.Token`\ s to the :py:meth:`build` - method. The list will be exhausted as it is parsed and a - :py:class:`.Wikicode` object containing the node tree will be returned. + To use, pass a list of :class:`.Token`\ s to the :meth:`build` method. The + list will be exhausted as it is parsed and a :class:`.Wikicode` object + containing the node tree will be returned. 
""" def __init__(self): @@ -64,8 +64,8 @@ class Builder(object): def _pop(self): """Pop the current node list off of the stack. - The raw node list is wrapped in a :py:class:`.SmartList` and then in a - :py:class:`.Wikicode` object. + The raw node list is wrapped in a :class:`.SmartList` and then in a + :class:`.Wikicode` object. """ return Wikicode(SmartList(self._stacks.pop())) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 28023b5..f568fac 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -35,72 +35,72 @@ will cover ``BAR == 0b10`` and ``BAZ == 0b01``). Local (stack-specific) contexts: -* :py:const:`TEMPLATE` +* :const:`TEMPLATE` - * :py:const:`TEMPLATE_NAME` - * :py:const:`TEMPLATE_PARAM_KEY` - * :py:const:`TEMPLATE_PARAM_VALUE` + * :const:`TEMPLATE_NAME` + * :const:`TEMPLATE_PARAM_KEY` + * :const:`TEMPLATE_PARAM_VALUE` -* :py:const:`ARGUMENT` +* :const:`ARGUMENT` - * :py:const:`ARGUMENT_NAME` - * :py:const:`ARGUMENT_DEFAULT` + * :const:`ARGUMENT_NAME` + * :const:`ARGUMENT_DEFAULT` -* :py:const:`WIKILINK` +* :const:`WIKILINK` - * :py:const:`WIKILINK_TITLE` - * :py:const:`WIKILINK_TEXT` + * :const:`WIKILINK_TITLE` + * :const:`WIKILINK_TEXT` -* :py:const:`EXT_LINK` +* :const:`EXT_LINK` - * :py:const:`EXT_LINK_URI` - * :py:const:`EXT_LINK_TITLE` + * :const:`EXT_LINK_URI` + * :const:`EXT_LINK_TITLE` -* :py:const:`HEADING` +* :const:`HEADING` - * :py:const:`HEADING_LEVEL_1` - * :py:const:`HEADING_LEVEL_2` - * :py:const:`HEADING_LEVEL_3` - * :py:const:`HEADING_LEVEL_4` - * :py:const:`HEADING_LEVEL_5` - * :py:const:`HEADING_LEVEL_6` + * :const:`HEADING_LEVEL_1` + * :const:`HEADING_LEVEL_2` + * :const:`HEADING_LEVEL_3` + * :const:`HEADING_LEVEL_4` + * :const:`HEADING_LEVEL_5` + * :const:`HEADING_LEVEL_6` -* :py:const:`TAG` +* :const:`TAG` - * :py:const:`TAG_OPEN` - * :py:const:`TAG_ATTR` - * :py:const:`TAG_BODY` - * :py:const:`TAG_CLOSE` + * :const:`TAG_OPEN` + * :const:`TAG_ATTR` + * :const:`TAG_BODY` + * :const:`TAG_CLOSE` -* :py:const:`STYLE` +* :const:`STYLE` - * :py:const:`STYLE_ITALICS` - * :py:const:`STYLE_BOLD` - * :py:const:`STYLE_PASS_AGAIN` - * :py:const:`STYLE_SECOND_PASS` + * :const:`STYLE_ITALICS` + * :const:`STYLE_BOLD` + * :const:`STYLE_PASS_AGAIN` + * :const:`STYLE_SECOND_PASS` -* :py:const:`DL_TERM` +* :const:`DL_TERM` -* :py:const:`SAFETY_CHECK` +* :const:`SAFETY_CHECK` - * :py:const:`HAS_TEXT` - * :py:const:`FAIL_ON_TEXT` - * :py:const:`FAIL_NEXT` - * :py:const:`FAIL_ON_LBRACE` - * :py:const:`FAIL_ON_RBRACE` - * :py:const:`FAIL_ON_EQUALS` + * :const:`HAS_TEXT` + * :const:`FAIL_ON_TEXT` + * :const:`FAIL_NEXT` + * :const:`FAIL_ON_LBRACE` + * :const:`FAIL_ON_RBRACE` + * :const:`FAIL_ON_EQUALS` Global contexts: -* :py:const:`GL_HEADING` +* :const:`GL_HEADING` Aggregate contexts: -* :py:const:`FAIL` -* :py:const:`UNSAFE` -* :py:const:`DOUBLE` -* :py:const:`NO_WIKILINKS` -* :py:const:`NO_EXT_LINKS` +* :const:`FAIL` +* :const:`UNSAFE` +* :const:`DOUBLE` +* :const:`NO_WIKILINKS` +* :const:`NO_EXT_LINKS` """ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 44f0d60..073e64c 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -135,7 +135,7 @@ class Tokenizer(object): """Fail the current tokenization route. Discards the current stack/context/textbuffer and raises - :py:exc:`~.BadRoute`. + :exc:`.BadRoute`. 
""" context = self._context self._pop() @@ -173,14 +173,14 @@ class Tokenizer(object): def _read(self, delta=0, wrap=False, strict=False): """Read the value at a relative point in the wikicode. - The value is read from :py:attr:`self._head <_head>` plus the value of + The value is read from :attr:`self._head <_head>` plus the value of *delta* (which can be negative). If *wrap* is ``False``, we will not allow attempts to read from the end of the string if ``self._head + delta`` is negative. If *strict* is ``True``, the route will be failed - (with :py:meth:`_fail_route`) if we try to read from past the end of - the string; otherwise, :py:attr:`self.END ` is returned. If we try - to read from before the start of the string, :py:attr:`self.START - ` is returned. + (with :meth:`_fail_route`) if we try to read from past the end of the + string; otherwise, :attr:`self.END ` is returned. If we try to + read from before the start of the string, :attr:`self.START ` is + returned. """ index = self._head + delta if index < 0 and (not wrap or abs(index) > len(self._text)): diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index e567731..2e38a1c 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -24,8 +24,8 @@ This module contains the token definitions that are used as an intermediate parsing data type - they are stored in a flat list, with each token being identified by its type and optional attributes. The token list is generated in -a syntactically valid form by the :py:class:`~.Tokenizer`, and then converted -into the :py:class`~.Wikicode` tree by the :py:class:`~.Builder`. +a syntactically valid form by the :class:`.Tokenizer`, and then converted into +the :class`.Wikicode` tree by the :class:`.Builder`. """ from __future__ import unicode_literals diff --git a/mwparserfromhell/smart_list.py b/mwparserfromhell/smart_list.py index cedfb5c..b4cfd1b 100644 --- a/mwparserfromhell/smart_list.py +++ b/mwparserfromhell/smart_list.py @@ -21,8 +21,8 @@ # SOFTWARE. """ -This module contains the :py:class:`~.SmartList` type, as well as its -:py:class:`~._ListProxy` child, which together implement a list whose sublists +This module contains the :class:`.SmartList` type, as well as its +:class:`._ListProxy` child, which together implement a list whose sublists reflect changes made to the main list, and vice-versa. """ @@ -35,7 +35,7 @@ __all__ = ["SmartList"] def inheritdoc(method): """Set __doc__ of *method* to __doc__ of *method* in its parent class. - Since this is used on :py:class:`~.SmartList`, the "parent class" used is + Since this is used on :class:`.SmartList`, the "parent class" used is ``list``. This function can be used as a decorator. """ method.__doc__ = getattr(list, method.__name__).__doc__ @@ -65,9 +65,9 @@ class SmartList(_SliceNormalizerMixIn, list): list (such as the addition, removal, or replacement of elements) will be reflected in the sublist, or vice-versa, to the greatest degree possible. This is implemented by having sublists - instances of the - :py:class:`~._ListProxy` type - dynamically determine their elements by - storing their slice info and retrieving that slice from the parent. Methods - that change the size of the list also change the slice info. For example:: + :class:`._ListProxy` type - dynamically determine their elements by storing + their slice info and retrieving that slice from the parent. Methods that + change the size of the list also change the slice info. 
For example:: >>> parent = SmartList([0, 1, 2, 3]) >>> parent @@ -84,8 +84,8 @@ class SmartList(_SliceNormalizerMixIn, list): The parent needs to keep a list of its children in order to update them, which prevents them from being garbage-collected. If you are keeping the parent around for a while but creating many children, it is advisable to - call :py:meth:`~._ListProxy.detach` when you're finished with them. Certain - parent methods, like :py:meth:`reverse` and :py:meth:`sort`, will do this + call :meth:`._ListProxy.detach` when you're finished with them. Certain + parent methods, like :meth:`reverse` and :meth:`sort`, will do this automatically. """ @@ -217,9 +217,9 @@ class SmartList(_SliceNormalizerMixIn, list): class _ListProxy(_SliceNormalizerMixIn, list): """Implement the ``list`` interface by getting elements from a parent. - This is created by a :py:class:`~.SmartList` object when slicing. It does - not actually store the list at any time; instead, whenever the list is - needed, it builds it dynamically using the :py:meth:`_render` method. + This is created by a :class:`.SmartList` object when slicing. It does not + actually store the list at any time; instead, whenever the list is needed, + it builds it dynamically using the :meth:`_render` method. """ def __init__(self, parent, sliceinfo): diff --git a/mwparserfromhell/string_mixin.py b/mwparserfromhell/string_mixin.py index fe41d6d..8da8692 100644 --- a/mwparserfromhell/string_mixin.py +++ b/mwparserfromhell/string_mixin.py @@ -21,7 +21,7 @@ # SOFTWARE. """ -This module contains the :py:class:`~.StringMixIn` type, which implements the +This module contains the :class:`.StringMixIn` type, which implements the interface for the ``unicode`` type (``str`` on py3k) in a dynamic manner. """ @@ -35,7 +35,7 @@ __all__ = ["StringMixIn"] def inheritdoc(method): """Set __doc__ of *method* to __doc__ of *method* in its parent class. - Since this is used on :py:class:`~.StringMixIn`, the "parent class" used is + Since this is used on :class:`.StringMixIn`, the "parent class" used is ``str``. This function can be used as a decorator. """ method.__doc__ = getattr(str, method.__name__).__doc__ @@ -44,11 +44,10 @@ def inheritdoc(method): class StringMixIn(object): """Implement the interface for ``unicode``/``str`` in a dynamic manner. - To use this class, inherit from it and override the :py:meth:`__unicode__` + To use this class, inherit from it and override the :meth:`__unicode__` method (same on py3k) to return the string representation of the object. - The various string methods will operate on the value of - :py:meth:`__unicode__` instead of the immutable ``self`` like the regular - ``str`` type. + The various string methods will operate on the value of :meth:`__unicode__` + instead of the immutable ``self`` like the regular ``str`` type. """ if py3k: diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index 8dc5e4e..8f518a6 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -34,18 +34,18 @@ from .smart_list import SmartList __all__ = ["parse_anything"] def parse_anything(value, context=0, skip_style_tags=False): - """Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. + """Return a :class:`.Wikicode` for *value*, allowing multiple types. - This differs from :py:meth:`.Parser.parse` in that we accept more than just - a string to be parsed. 
Unicode objects (strings in py3k), strings (bytes in - py3k), integers (converted to strings), ``None``, existing - :py:class:`~.Node` or :py:class:`~.Wikicode` objects, as well as an - iterable of these types, are supported. This is used to parse input - on-the-fly by various methods of :py:class:`~.Wikicode` and others like - :py:class:`~.Template`, such as :py:meth:`wikicode.insert() - <.Wikicode.insert>` or setting :py:meth:`template.name <.Template.name>`. + This differs from :meth:`.Parser.parse` in that we accept more than just a + string to be parsed. Unicode objects (strings in py3k), strings (bytes in + py3k), integers (converted to strings), ``None``, existing :class:`.Node` + or :class:`.Wikicode` objects, as well as an iterable of these types, are + supported. This is used to parse input on-the-fly by various methods of + :class:`.Wikicode` and others like :class:`.Template`, such as + :meth:`wikicode.insert() <.Wikicode.insert>` or setting + :meth:`template.name <.Template.name>`. - Additional arguments are passed directly to :py:meth:`.Parser.parse`. + Additional arguments are passed directly to :meth:`.Parser.parse`. """ from .parser import Parser from .wikicode import Wikicode diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index ffa6790..c24bc5f 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -39,8 +39,8 @@ class Wikicode(StringMixIn): Additionally, it contains methods that can be used to extract data from or modify the nodes, implemented in an interface similar to a list. For - example, :py:meth:`index` can get the index of a node in the list, and - :py:meth:`insert` can add a new node at that index. The :py:meth:`filter() + example, :meth:`index` can get the index of a node in the list, and + :meth:`insert` can add a new node at that index. The :meth:`filter() ` series of functions is very useful for extracting and iterating over, for example, all of the templates in the object. """ @@ -55,7 +55,7 @@ class Wikicode(StringMixIn): @staticmethod def _get_children(node, contexts=False, restrict=None, parent=None): - """Iterate over all child :py:class:`.Node`\ s of a given *node*.""" + """Iterate over all child :class:`.Node`\ s of a given *node*.""" yield (parent, node) if contexts else node if restrict and isinstance(node, restrict): return @@ -74,7 +74,7 @@ class Wikicode(StringMixIn): @staticmethod def _build_matcher(matches, flags): - """Helper for :py:meth:`_indexed_ifilter` and others. + """Helper for :meth:`_indexed_ifilter` and others. If *matches* is a function, return it. If it's a regex, return a wrapper around it that can be called with a node to do a search. If @@ -90,7 +90,7 @@ class Wikicode(StringMixIn): forcetype=None): """Iterate over nodes and their corresponding indices in the node list. - The arguments are interpreted as for :py:meth:`ifilter`. For each tuple + The arguments are interpreted as for :meth:`ifilter`. For each tuple ``(i, node)`` yielded by this method, ``self.index(node) == i``. Note that if *recursive* is ``True``, ``self.nodes[i]`` might not be the node itself, but will still contain it. @@ -111,14 +111,14 @@ class Wikicode(StringMixIn): def _do_strong_search(self, obj, recursive=True): """Search for the specific element *obj* within the node list. - *obj* can be either a :py:class:`.Node` or a :py:class:`.Wikicode` - object. 
If found, we return a tuple (*context*, *index*) where - *context* is the :py:class:`.Wikicode` that contains *obj* and *index* - is its index there, as a :py:class:`slice`. Note that if *recursive* is - ``False``, *context* will always be ``self`` (since we only look for - *obj* among immediate descendants), but if *recursive* is ``True``, - then it could be any :py:class:`.Wikicode` contained by a node within - ``self``. If *obj* is not found, :py:exc:`ValueError` is raised. + *obj* can be either a :class:`.Node` or a :class:`.Wikicode` object. If + found, we return a tuple (*context*, *index*) where *context* is the + :class:`.Wikicode` that contains *obj* and *index* is its index there, + as a :class:`slice`. Note that if *recursive* is ``False``, *context* + will always be ``self`` (since we only look for *obj* among immediate + descendants), but if *recursive* is ``True``, then it could be any + :class:`.Wikicode` contained by a node within ``self``. If *obj* is not + found, :exc:`ValueError` is raised. """ if isinstance(obj, Node): mkslice = lambda i: slice(i, i + 1) @@ -141,14 +141,14 @@ class Wikicode(StringMixIn): def _do_weak_search(self, obj, recursive): """Search for an element that looks like *obj* within the node list. - This follows the same rules as :py:meth:`_do_strong_search` with some + This follows the same rules as :meth:`_do_strong_search` with some differences. *obj* is treated as a string that might represent any - :py:class:`.Node`, :py:class:`.Wikicode`, or combination of the two - present in the node list. Thus, matching is weak (using string - comparisons) rather than strong (using ``is``). Because multiple nodes - can match *obj*, the result is a list of tuples instead of just one - (however, :py:exc:`ValueError` is still raised if nothing is found). - Individual matches will never overlap. + :class:`.Node`, :class:`.Wikicode`, or combination of the two present + in the node list. Thus, matching is weak (using string comparisons) + rather than strong (using ``is``). Because multiple nodes can match + *obj*, the result is a list of tuples instead of just one (however, + :exc:`ValueError` is still raised if nothing is found). Individual + matches will never overlap. The tuples contain a new first element, *exact*, which is ``True`` if we were able to match *obj* exactly to one or more adjacent nodes, or @@ -212,19 +212,19 @@ class Wikicode(StringMixIn): def _build_filter_methods(cls, **meths): """Given Node types, build the corresponding i?filter shortcuts. - The should be given as keys storing the method's base name paired - with values storing the corresponding :py:class:`~.Node` type. For - example, the dict may contain the pair ``("templates", Template)``, - which will produce the methods :py:meth:`ifilter_templates` and - :py:meth:`filter_templates`, which are shortcuts for - :py:meth:`ifilter(forcetype=Template) ` and - :py:meth:`filter(forcetype=Template) `, respectively. These + The should be given as keys storing the method's base name paired with + values storing the corresponding :class:`.Node` type. For example, the + dict may contain the pair ``("templates", Template)``, which will + produce the methods :meth:`ifilter_templates` and + :meth:`filter_templates`, which are shortcuts for + :meth:`ifilter(forcetype=Template) ` and + :meth:`filter(forcetype=Template) `, respectively. These shortcuts are added to the class itself, with an appropriate docstring. """ doc = """Iterate over {0}. 
- This is equivalent to :py:meth:`{1}` with *forcetype* set to - :py:class:`~{2.__module__}.{2.__name__}`. + This is equivalent to :meth:`{1}` with *forcetype* set to + :class:`~{2.__module__}.{2.__name__}`. """ make_ifilter = lambda ftype: (lambda self, *a, **kw: self.ifilter(forcetype=ftype, *a, **kw)) @@ -240,10 +240,10 @@ class Wikicode(StringMixIn): @property def nodes(self): - """A list of :py:class:`~.Node` objects. + """A list of :class:`.Node` objects. - This is the internal data actually stored within a - :py:class:`~.Wikicode` object. + This is the internal data actually stored within a :class:`.Wikicode` + object. """ return self._nodes @@ -260,11 +260,10 @@ class Wikicode(StringMixIn): def set(self, index, value): """Set the ``Node`` at *index* to *value*. - Raises :py:exc:`IndexError` if *index* is out of range, or - :py:exc:`ValueError` if *value* cannot be coerced into one - :py:class:`~.Node`. To insert multiple nodes at an index, use - :py:meth:`get` with either :py:meth:`remove` and :py:meth:`insert` or - :py:meth:`replace`. + Raises :exc:`IndexError` if *index* is out of range, or + :exc:`ValueError` if *value* cannot be coerced into one :class:`.Node`. + To insert multiple nodes at an index, use :meth:`get` with either + :meth:`remove` and :meth:`insert` or :meth:`replace`. """ nodes = parse_anything(value).nodes if len(nodes) > 1: @@ -279,7 +278,7 @@ class Wikicode(StringMixIn): def index(self, obj, recursive=False): """Return the index of *obj* in the list of nodes. - Raises :py:exc:`ValueError` if *obj* is not found. If *recursive* is + Raises :exc:`ValueError` if *obj* is not found. If *recursive* is ``True``, we will look in all nodes of ours and their descendants, and return the index of our direct descendant node within *our* list of nodes. Otherwise, the lookup is done only on direct descendants. @@ -298,9 +297,8 @@ class Wikicode(StringMixIn): def insert(self, index, value): """Insert *value* at *index* in the list of nodes. - *value* can be anything parsable by :py:func:`.parse_anything`, which - includes strings or other :py:class:`~.Wikicode` or :py:class:`~.Node` - objects. + *value* can be anything parsable by :func:`.parse_anything`, which + includes strings or other :class:`.Wikicode` or :class:`.Node` objects. """ nodes = parse_anything(value).nodes for node in reversed(nodes): @@ -309,15 +307,14 @@ class Wikicode(StringMixIn): def insert_before(self, obj, value, recursive=True): """Insert *value* immediately before *obj*. - *obj* can be either a string, a :py:class:`~.Node`, or another - :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`, - for example). If *obj* is a string, we will operate on all instances - of that string within the code, otherwise only on the specific instance - given. *value* can be anything parsable by :py:func:`.parse_anything`. - If *recursive* is ``True``, we will try to find *obj* within our child - nodes even if it is not a direct descendant of this - :py:class:`~.Wikicode` object. If *obj* is not found, - :py:exc:`ValueError` is raised. + *obj* can be either a string, a :class:`.Node`, or another + :class:`.Wikicode` object (as created by :meth:`get_sections`, for + example). If *obj* is a string, we will operate on all instances of + that string within the code, otherwise only on the specific instance + given. *value* can be anything parsable by :func:`.parse_anything`. 
If + *recursive* is ``True``, we will try to find *obj* within our child + nodes even if it is not a direct descendant of this :class:`.Wikicode` + object. If *obj* is not found, :exc:`ValueError` is raised. """ if isinstance(obj, (Node, Wikicode)): context, index = self._do_strong_search(obj, recursive) @@ -333,15 +330,14 @@ class Wikicode(StringMixIn): def insert_after(self, obj, value, recursive=True): """Insert *value* immediately after *obj*. - *obj* can be either a string, a :py:class:`~.Node`, or another - :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`, - for example). If *obj* is a string, we will operate on all instances - of that string within the code, otherwise only on the specific instance - given. *value* can be anything parsable by :py:func:`.parse_anything`. - If *recursive* is ``True``, we will try to find *obj* within our child - nodes even if it is not a direct descendant of this - :py:class:`~.Wikicode` object. If *obj* is not found, - :py:exc:`ValueError` is raised. + *obj* can be either a string, a :class:`.Node`, or another + :class:`.Wikicode` object (as created by :meth:`get_sections`, for + example). If *obj* is a string, we will operate on all instances of + that string within the code, otherwise only on the specific instance + given. *value* can be anything parsable by :func:`.parse_anything`. If + *recursive* is ``True``, we will try to find *obj* within our child + nodes even if it is not a direct descendant of this :class:`.Wikicode` + object. If *obj* is not found, :exc:`ValueError` is raised. """ if isinstance(obj, (Node, Wikicode)): context, index = self._do_strong_search(obj, recursive) @@ -357,15 +353,14 @@ class Wikicode(StringMixIn): def replace(self, obj, value, recursive=True): """Replace *obj* with *value*. - *obj* can be either a string, a :py:class:`~.Node`, or another - :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`, - for example). If *obj* is a string, we will operate on all instances - of that string within the code, otherwise only on the specific instance - given. *value* can be anything parsable by :py:func:`.parse_anything`. + *obj* can be either a string, a :class:`.Node`, or another + :class:`.Wikicode` object (as created by :meth:`get_sections`, for + example). If *obj* is a string, we will operate on all instances of + that string within the code, otherwise only on the specific instance + given. *value* can be anything parsable by :func:`.parse_anything`. If *recursive* is ``True``, we will try to find *obj* within our child - nodes even if it is not a direct descendant of this - :py:class:`~.Wikicode` object. If *obj* is not found, - :py:exc:`ValueError` is raised. + nodes even if it is not a direct descendant of this :class:`.Wikicode` + object. If *obj* is not found, :exc:`ValueError` is raised. """ if isinstance(obj, (Node, Wikicode)): context, index = self._do_strong_search(obj, recursive) @@ -384,7 +379,7 @@ class Wikicode(StringMixIn): def append(self, value): """Insert *value* at the end of the list of nodes. - *value* can be anything parsable by :py:func:`.parse_anything`. + *value* can be anything parsable by :func:`.parse_anything`. """ nodes = parse_anything(value).nodes for node in nodes: @@ -393,14 +388,14 @@ class Wikicode(StringMixIn): def remove(self, obj, recursive=True): """Remove *obj* from the list of nodes. - *obj* can be either a string, a :py:class:`~.Node`, or another - :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`, - for example). 
If *obj* is a string, we will operate on all instances - of that string within the code, otherwise only on the specific instance + *obj* can be either a string, a :class:`.Node`, or another + :class:`.Wikicode` object (as created by :meth:`get_sections`, for + example). If *obj* is a string, we will operate on all instances of + that string within the code, otherwise only on the specific instance given. If *recursive* is ``True``, we will try to find *obj* within our child nodes even if it is not a direct descendant of this - :py:class:`~.Wikicode` object. If *obj* is not found, - :py:exc:`ValueError` is raised. + :class:`.Wikicode` object. If *obj* is not found, :exc:`ValueError` is + raised. """ if isinstance(obj, (Node, Wikicode)): context, index = self._do_strong_search(obj, recursive) @@ -417,10 +412,10 @@ class Wikicode(StringMixIn): def matches(self, other): """Do a loose equivalency test suitable for comparing page names. - *other* can be any string-like object, including - :py:class:`~.Wikicode`, or a tuple of these. This operation is - symmetric; both sides are adjusted. Specifically, whitespace and markup - is stripped and the first letter's case is normalized. Typical usage is + *other* can be any string-like object, including :class:`.Wikicode`, or + a tuple of these. This operation is symmetric; both sides are adjusted. + Specifically, whitespace and markup is stripped and the first letter's + case is normalized. Typical usage is ``if template.name.matches("stub"): ...``. """ cmp = lambda a, b: (a[0].upper() + a[1:] == b[0].upper() + b[1:] @@ -453,12 +448,12 @@ class Wikicode(StringMixIn): ["{{foo}}", "{{foo|{{bar}}}}"] *matches* can be used to further restrict the nodes, either as a - function (taking a single :py:class:`.Node` and returning a boolean) or - a regular expression (matched against the node's string representation - with :py:func:`re.search`). If *matches* is a regex, the flags passed - to :py:func:`re.search` are :py:const:`re.IGNORECASE`, - :py:const:`re.DOTALL`, and :py:const:`re.UNICODE`, but custom flags can - be specified by passing *flags*. + function (taking a single :class:`.Node` and returning a boolean) or a + regular expression (matched against the node's string representation + with :func:`re.search`). If *matches* is a regex, the flags passed to + :func:`re.search` are :const:`re.IGNORECASE`, :const:`re.DOTALL`, and + :const:`re.UNICODE`, but custom flags can be specified by passing + *flags*. """ gen = self._indexed_ifilter(recursive, matches, flags, forcetype) return (node for i, node in gen) @@ -466,7 +461,7 @@ class Wikicode(StringMixIn): def filter(self, *args, **kwargs): """Return a list of nodes within our list matching certain conditions. - This is equivalent to calling :py:func:`list` on :py:meth:`ifilter`. + This is equivalent to calling :func:`list` on :meth:`ifilter`. """ return list(self.ifilter(*args, **kwargs)) @@ -474,9 +469,9 @@ class Wikicode(StringMixIn): include_lead=None, include_headings=True): """Return a list of sections within the page. - Sections are returned as :py:class:`~.Wikicode` objects with a shared - node list (implemented using :py:class:`~.SmartList`) so that changes - to sections are reflected in the parent Wikicode object. + Sections are returned as :class:`.Wikicode` objects with a shared node + list (implemented using :class:`.SmartList`) so that changes to + sections are reflected in the parent Wikicode object. Each section contains all of its subsections, unless *flat* is ``True``. 
If *levels* is given, it should be a iterable of integers; @@ -484,14 +479,13 @@ class Wikicode(StringMixIn): *matches* is given, it should be either a function or a regex; only sections whose headings match it (without the surrounding equal signs) will be included. *flags* can be used to override the default regex - flags (see :py:meth:`ifilter`) if a regex *matches* is used. + flags (see :meth:`ifilter`) if a regex *matches* is used. If *include_lead* is ``True``, the first, lead section (without a heading) will be included in the list; ``False`` will not include it; the default will include it only if no specific *levels* were given. If *include_headings* is ``True``, the section's beginning - :py:class:`~.Heading` object will be included; otherwise, this is - skipped. + :class:`.Heading` object will be included; otherwise, this is skipped. """ title_matcher = self._build_matcher(matches, flags) matcher = lambda heading: (title_matcher(heading.title) and @@ -540,7 +534,7 @@ class Wikicode(StringMixIn): """Return a rendered string without unprintable code such as templates. The way a node is stripped is handled by the - :py:meth:`~.Node.__strip__` method of :py:class:`~.Node` objects, which + :meth:`~.Node.__strip__` method of :class:`.Node` objects, which generally return a subset of their nodes or ``None``. For example, templates and tags are removed completely, links are stripped to just their display part, headings are stripped to just their title. If @@ -568,9 +562,9 @@ class Wikicode(StringMixIn): """Return a hierarchical tree representation of the object. The representation is a string makes the most sense printed. It is - built by calling :py:meth:`_get_tree` on the - :py:class:`~.Wikicode` object and its children recursively. The end - result may look something like the following:: + built by calling :meth:`_get_tree` on the :class:`.Wikicode` object and + its children recursively. The end result may look something like the + following:: >>> text = "Lorem ipsum {{foo|bar|{{baz}}|spam=eggs}}" >>> print mwparserfromhell.parse(text).get_tree() From a8d2983161e422e27e0de8c1261b196e7a79363b Mon Sep 17 00:00:00 2001 From: David Winegar Date: Mon, 14 Jul 2014 10:37:36 -0700 Subject: [PATCH 040/102] Started table parsing in PyTokenizer Started parsing table support and added the start of table support. This is a big commit (ugh) and it should probably be split up into multiple smaller ones if possible, but that seems unworkable as of right now because of all the dependencies. Also breaks tests of CTokenizer (double ugh) because I haven't started table support there. May want to pick line by line on this commit later but I need to save my work for now. 
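
For reference, here is a minimal sketch (illustrative only, not code from
this commit) of how the new table contexts combine as bit flags. The
constant values mirror the ones added to contexts.py below; the _FLAGS
table and describe() helper are hypothetical:

    TABLE_OPEN = 1 << 30
    TABLE_CELL_LINE = 1 << 31
    TABLE_HEADER_LINE = 1 << 32
    TABLE_CELL_OPEN = 1 << 33
    TABLE_CELL_STYLE_POSSIBLE = 1 << 34

    _FLAGS = [
        ("TABLE_OPEN", TABLE_OPEN),
        ("TABLE_CELL_LINE", TABLE_CELL_LINE),
        ("TABLE_HEADER_LINE", TABLE_HEADER_LINE),
        ("TABLE_CELL_OPEN", TABLE_CELL_OPEN),
        ("TABLE_CELL_STYLE_POSSIBLE", TABLE_CELL_STYLE_POSSIBLE),
    ]

    def describe(context):
        """Hypothetical helper: name the table flags set in *context*."""
        return [name for name, flag in _FLAGS if context & flag]

    # A cell line inside an open table carries both flags at once:
    context = TABLE_OPEN | TABLE_CELL_LINE
    assert describe(context) == ["TABLE_OPEN", "TABLE_CELL_LINE"]

    # On a newline the tokenizer clears the line-level flags but stays
    # inside the table:
    context &= ~TABLE_CELL_LINE & ~TABLE_HEADER_LINE & ~TABLE_CELL_STYLE_POSSIBLE
    assert describe(context) == ["TABLE_OPEN"]
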
--- mwparserfromhell/definitions.py | 2 +- mwparserfromhell/parser/contexts.py | 8 +++- mwparserfromhell/parser/tokenizer.py | 76 +++++++++++++++++++++++++++++++++--- tests/tokenizer/tables.mwtest | 32 +++++++++++++++ 4 files changed, 111 insertions(+), 7 deletions(-) create mode 100644 tests/tokenizer/tables.mwtest diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index 6020ad1..af41f49 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -52,7 +52,7 @@ INVISIBLE_TAGS = [ # [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] -SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] +SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"] MARKUP_TO_HTML = { "#": "li", diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index f568fac..678a392 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -155,13 +155,19 @@ FAIL_ON_EQUALS = 1 << 29 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) +TABLE_OPEN = 1 << 30 +TABLE_CELL_LINE = 1 << 31 +TABLE_HEADER_LINE = 1 << 32 +TABLE_CELL_OPEN = 1 << 33 +TABLE_CELL_STYLE_POSSIBLE = 1 << 34 + # Global contexts: GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE +FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE_OPEN UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 073e64c..70e2d5d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1002,6 +1002,39 @@ class Tokenizer(object): self._fail_route() return self._pop() + def _handle_table_start(self): + """Handle the start of a table.""" + # TODO - fail all other contexts on start? 
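+        # Skip past the two-character "{|" marker, but remember a reset
+        # point: if the table route fails with BadRoute, the head rewinds
+        # and "{|" is re-emitted as plain text.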
+ self._head += 2 + reset = self._head - 1 + try: + table = self._parse(contexts.TABLE_OPEN) + except BadRoute: + self._head = reset + self._emit_text("{|") + else: + self._emit_style_tag("table", "{|", table) + + def _handle_table_end(self): + self._head += 2 + return self._pop() + + def _handle_table_row(self): + self._head += 2 + self._emit(tokens.TagOpenOpen(wiki_markup="{-")) + self._emit_text("tr") + self._emit(tokens.TagCloseSelfclose()) + self._context &= ~contexts.TABLE_CELL_OPEN + + def _handle_table_cell(self): + pass + + def _handle_header_cell(self): + pass + + def _handle_cell_style(self): + pass + def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" context = self._context @@ -1144,15 +1177,48 @@ class Tokenizer(object): result = self._parse_style() if result is not None: return result - elif self._read(-1) in ("\n", self.START): - if this in ("#", "*", ";", ":"): + elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"): self._handle_list() - elif this == next == self._read(2) == self._read(3) == "-": + elif self._read(-1) in ("\n", self.START) and this == next == self._read(2) == self._read(3) == "-": self._handle_hr() - else: - self._emit_text(this) elif this in ("\n", ":") and self._context & contexts.DL_TERM: self._handle_dl_term() + + elif (this == "{" and next == "|" and (self._read(-1) in ("\n", self.START)) or + (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + if self._can_recurse(): + self._handle_table_start() + else: + self._emit_text("{|") + elif self._context & contexts.TABLE_OPEN: + if this == "|" and next == "}": + return self._handle_table_end() + elif this == "|" and next == "|" and self._context & contexts.TABLE_CELL_LINE: + self._handle_table_cell() + elif this == "|" and next == "|" and self._context & contexts.TABLE_HEADER_LINE: + self._handle_header_cell() + elif this == "!" and next == "!" and self._context & contexts.TABLE_HEADER_LINE: + self._handle_header_cell() + elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: + self._handle_cell_style() + # on newline, clear out cell line contexts + elif this == "\n" and self._context & (contexts.TABLE_CELL_LINE | contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_STYLE_POSSIBLE): + self._context &= (~contexts.TABLE_CELL_LINE & ~contexts.TABLE_HEADER_LINE & ~contexts.TABLE_CELL_STYLE_POSSIBLE) + self._emit_text(this) + # newline or whitespace/newline + elif (self._read(-1) in ("\n", self.START) or + (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + if this == "|" and next == "-": + self._handle_table_row() + elif this == "|" and self._can_recurse(): + self._handle_table_cell() + elif this == "!" and self._can_recurse(): + self._handle_header_cell() + else: + self._emit_text(this) + else: + self._emit_text(this) + else: self._emit_text(this) self._head += 1 diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest new file mode 100644 index 0000000..399f7fd --- /dev/null +++ b/tests/tokenizer/tables.mwtest @@ -0,0 +1,32 @@ +name: empty_table +label: Parsing an empty table. +input: "{|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: inline_table +label: Correctly handle tables with close on the same line. 
+input: "{||}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: no_table_close_simple +label: Handle case when there is no table close. +input: "{| " +output: [Text(text="{| ")] + +--- + +name: leading_whitespace_table +label: Handle leading whitespace for a table. +input: "foo \n \t {|\n|}" +output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: leading_characters_table +label: Don't parse as a table when leading characters are not newline or whitespace. +input: "foo \n foo \t {|\n|}" +output: [Text(text="foo \n foo \t {|\n|}")] From b7e40d7b5aea817c23de68326627c263652cc36c Mon Sep 17 00:00:00 2001 From: David Winegar Date: Mon, 14 Jul 2014 16:03:09 -0700 Subject: [PATCH 041/102] Table cells now recurse Added another stack layer for tokenizing table cells because of styling/correctness of implementation. Added many tests cases. --- mwparserfromhell/parser/tokenizer.py | 68 ++++++++++++++++++++++++++---------- tests/tokenizer/tables.mwtest | 56 +++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 18 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 70e2d5d..80cb501 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1020,17 +1020,34 @@ class Tokenizer(object): return self._pop() def _handle_table_row(self): - self._head += 2 - self._emit(tokens.TagOpenOpen(wiki_markup="{-")) + self._head += 1 + self._emit(tokens.TagOpenOpen(wiki_markup="|-")) self._emit_text("tr") self._emit(tokens.TagCloseSelfclose()) - self._context &= ~contexts.TABLE_CELL_OPEN - def _handle_table_cell(self): - pass + def _handle_table_cell(self, markup, tag, line_context): + """Parse as normal syntax unless we hit a style marker, then parse as HTML attributes""" + if not self._can_recurse(): + self._emit_text(markup) + self._head += len(markup) - 1 + return - def _handle_header_cell(self): - pass + reset = self._head + self._head += len(markup) + try: + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | contexts.TABLE_CELL_STYLE_POSSIBLE | line_context) + except BadRoute: + self._head = reset + raise + else: + self._emit(tokens.TagOpenOpen(wiki_markup=markup)) + self._emit_text(tag) + self._emit(tokens.TagCloseSelfclose()) + self._emit_all(cell) + self._head -= 1 + + def _handle_table_cell_end(self): + return self._pop() def _handle_cell_style(self): pass @@ -1184,36 +1201,51 @@ class Tokenizer(object): elif this in ("\n", ":") and self._context & contexts.DL_TERM: self._handle_dl_term() - elif (this == "{" and next == "|" and (self._read(-1) in ("\n", self.START)) or - (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or + (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): if self._can_recurse(): self._handle_table_start() else: self._emit_text("{|") elif self._context & contexts.TABLE_OPEN: if this == "|" and next == "}": + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() return self._handle_table_end() elif this == "|" and next == "|" and self._context & contexts.TABLE_CELL_LINE: - self._handle_table_cell() + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + 
self._handle_table_cell("||", "td", contexts.TABLE_CELL_LINE) elif this == "|" and next == "|" and self._context & contexts.TABLE_HEADER_LINE: - self._handle_header_cell() + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + self._handle_table_cell("||", "th", contexts.TABLE_HEADER_LINE) elif this == "!" and next == "!" and self._context & contexts.TABLE_HEADER_LINE: - self._handle_header_cell() + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + self._handle_table_cell("!!", "th", contexts.TABLE_HEADER_LINE) elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: self._handle_cell_style() # on newline, clear out cell line contexts elif this == "\n" and self._context & (contexts.TABLE_CELL_LINE | contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_STYLE_POSSIBLE): + # TODO might not be handled due to DL_TERM code above + # TODO does this even work? self._context &= (~contexts.TABLE_CELL_LINE & ~contexts.TABLE_HEADER_LINE & ~contexts.TABLE_CELL_STYLE_POSSIBLE) self._emit_text(this) - # newline or whitespace/newline elif (self._read(-1) in ("\n", self.START) or - (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): if this == "|" and next == "-": + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() self._handle_table_row() - elif this == "|" and self._can_recurse(): - self._handle_table_cell() - elif this == "!" and self._can_recurse(): - self._handle_header_cell() + elif this == "|": + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + self._handle_table_cell("|", "td", contexts.TABLE_CELL_LINE) + elif this == "!": + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + self._handle_table_cell("!", "th", contexts.TABLE_HEADER_LINE) else: self._emit_text(this) else: diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 399f7fd..f818f65 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -19,6 +19,13 @@ output: [Text(text="{| ")] --- +name: no_table_close_inside_cell +label: Handle case when there is no table close while inside of a cell. +input: "{| | " +output: [Text(text="{| | ")] + +--- + name: leading_whitespace_table label: Handle leading whitespace for a table. input: "foo \n \t {|\n|}" @@ -30,3 +37,52 @@ name: leading_characters_table label: Don't parse as a table when leading characters are not newline or whitespace. input: "foo \n foo \t {|\n|}" output: [Text(text="foo \n foo \t {|\n|}")] + +--- + +name: table_row_simple +label: Simple table row. +input: "{|\n |- \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(), Text(text=" \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_simple +label: Simple table cell. +input: "{|\n | foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: nowiki_inside_table +label: Nowiki handles pipe characters in tables. +input: "{|\n | foo | |- {| |} || ! !! 
bar \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_text_outside_cell +label: Parse text inside table but outside of a cell. +input: "{|\n bar \n | foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: no_table_cell_with_leading_characters +label: Fail to create a table cell when there are leading non-whitespace characters. +input: "{|\n bar | foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar | foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: no_table_row_with_leading_characters +label: Fail to create a table row when there are leading non-whitespace characters. +input: "{|\n bar |- foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar |- foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: template_inside_table_cell +label: Template within table cell. +input: "{|\n |{{foo\n|bar=baz}} \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] From a13bc948fae32485087feae30b115728885a7abf Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 10:17:23 -0700 Subject: [PATCH 042/102] Started table cell attribute support Started support for parsing table style attributes. I suspect some of this is incorrect, need to add more tests to see. 
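
As a rough illustration of the separator rule the new style tests
exercise, here is a toy stand-in (split_cell() is hypothetical, not the
patch's _parse_as_table_style()). Everything between the cell marker and
the first following pipe is treated as attribute text, and per the
quote_with_pipe test even a pipe inside a quoted value still acts as the
separator:

    def split_cell(cell_text):
        """Toy stand-in: split a cell body at the first pipe, which
        acts as the style separator even inside a quoted value."""
        head, sep, rest = cell_text.partition("|")
        return (head, rest) if sep else (None, cell_text)

    assert split_cell(' name="foo bar"| test ') == (' name="foo bar"', ' test ')
    assert split_cell(' name="foo|bar"| test ') == (' name="foo', 'bar"| test ')
    assert split_cell(" no style separator ") == (None, " no style separator ")
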
--- mwparserfromhell/parser/tokenizer.py | 66 +++++++++++++++++++++++++++++++----- tests/tokenizer/tables.mwtest | 35 +++++++++++++++++++ 2 files changed, 92 insertions(+), 9 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 80cb501..f09adc8 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1027,30 +1027,78 @@ class Tokenizer(object): def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse as HTML attributes""" + table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context if not self._can_recurse(): self._emit_text(markup) + # TODO check if this works self._head += len(markup) - 1 return reset = self._head self._head += len(markup) + style = None try: - cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | contexts.TABLE_CELL_STYLE_POSSIBLE | line_context) + (cell_context, cell) = self._parse(table_context | contexts.TABLE_CELL_STYLE_POSSIBLE) except BadRoute: self._head = reset raise - else: - self._emit(tokens.TagOpenOpen(wiki_markup=markup)) - self._emit_text(tag) - self._emit(tokens.TagCloseSelfclose()) - self._emit_all(cell) - self._head -= 1 + # except for handling cell style + except StopIteration: + self._head = reset + len(markup) + try: + style = self._parse_as_table_style("|") + (cell_context, cell) = self._parse(table_context) + except BadRoute: + assert False + self._head = reset + raise + self._emit(tokens.TagOpenOpen(wiki_markup=markup)) + self._emit_text(tag) + if style: + # this looks highly suspicious + if type(style[0] == tokens.Text): + style.pop(0) + self._emit_all(style) + self._emit(tokens.TagCloseSelfclose()) + self._emit_all(cell) + # keep header/cell line contexts + self._context |= cell_context & (contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_LINE) + # offset displacement done by _parse() + self._head -= 1 + + def _parse_as_table_style(self, end_token): + data = _TagOpenData() + data.context = _TagOpenData.CX_ATTR_READY + while True: + this, next = self._read(), self._read(1) + can_exit = (not data.context & (data.CX_NAME) or + data.context & data.CX_NOTE_SPACE) + if this is self.END: + if self._context & contexts.TAG_ATTR: + if data.context & data.CX_QUOTED: + # Unclosed attribute quote: reset, don't die + data.context = data.CX_ATTR_VALUE + self._pop() + self._head = data.reset + continue + self._pop() + self._fail_route() + elif this == end_token and can_exit: + if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): + self._push_tag_buffer(data) + self._head += 1 + return self._pop() + else: + self._handle_tag_data(data, this) + self._head += 1 def _handle_table_cell_end(self): - return self._pop() + """Returns the context and stack in a tuple.""" + return (self._context, self._pop()) def _handle_cell_style(self): - pass + """Pop the cell off the stack and try to parse as style""" + raise StopIteration() def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index f818f65..e7eb40c 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -54,6 +54,13 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text --- +name: table_cell_inline +label: Multiple inline table cells. 
+input: "{|\n | foo || bar || test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! bar \n|}" @@ -86,3 +93,31 @@ name: template_inside_table_cell label: Template within table cell. input: "{|\n |{{foo\n|bar=baz}} \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes +label: Parse table cell style attributes. +input: "{| \n | name="foo bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes_quote_with_pipe +label: Pipe inside an attribute quote should still be used as a style separator. +input: "{| \n | name="foo|bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(), Text(text="bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes_name_with_pipe +label: Pipe inside an attribute name should still be used as a style separator. +input: "{| \n | name|="foo bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes_pipe_after_equals +label: Pipe inside an attribute should still be used as a style separator after an equals. +input: "{| \n | name=|"foo|bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(), Text(text="\"foo|bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] From 0bba69d5dc32bea027a13573490263530456269d Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 10:23:44 -0700 Subject: [PATCH 043/102] Added tests/support for header cells Support for header cells was mostly in already, just needed minor changes. Added two tests as well. 
--- mwparserfromhell/parser/tokenizer.py | 2 +- tests/tokenizer/tables.mwtest | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index f09adc8..b899e75 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -63,7 +63,7 @@ class Tokenizer(object): START = object() END = object() MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", - ":", "/", "-", "\n", START, END] + ":", "/", "-", "!", "\n", START, END] MAX_DEPTH = 40 MAX_CYCLES = 100000 regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index e7eb40c..1087381 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -61,6 +61,20 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text --- +name: table_header_simple +label: Simple header cell. +input: "{|\n ! foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_header_inline +label: Multiple inline header cells. +input: "{|\n ! foo || bar !! test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! bar \n|}" From 9f159ecfa2443cbacf542c174058f3cd37eeb08d Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 13:32:33 -0700 Subject: [PATCH 044/102] Add table start/row start style attribute support Started styling attributes for table row and table start. Still not entirely sure about this, definitely need to make changes regarding padding. --- mwparserfromhell/parser/tokenizer.py | 49 ++++++++++++++++++++++++++++++------ tests/tokenizer/tables.mwtest | 24 +++++++++++++++++- 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b899e75..c2d5240 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1007,23 +1007,53 @@ class Tokenizer(object): # TODO - fail all other contexts on start? 
self._head += 2 reset = self._head - 1 + style = None try: + self._push(contexts.TABLE_OPEN) + style = self._parse_as_table_style("\n", break_on_table_end=True) + if len(style) == 0: + self._head = reset + 1 table = self._parse(contexts.TABLE_OPEN) except BadRoute: self._head = reset self._emit_text("{|") else: - self._emit_style_tag("table", "{|", table) + self._emit(tokens.TagOpenOpen(wiki_markup="{|")) + self._emit_text("table") + if style: + self._emit_all(style) + self._emit(tokens.TagCloseOpen()) + self._emit_all(table) + self._emit(tokens.TagOpenClose()) + self._emit_text("table") + self._emit(tokens.TagCloseClose()) + # self._emit_style_tag("table", "{|", table) def _handle_table_end(self): self._head += 2 return self._pop() def _handle_table_row(self): - self._head += 1 - self._emit(tokens.TagOpenOpen(wiki_markup="|-")) - self._emit_text("tr") - self._emit(tokens.TagCloseSelfclose()) + reset = self._head + self._head += 2 + try: + self._push(contexts.TABLE_OPEN) + style = self._parse_as_table_style("\n") + if len(style) == 0: + self._head = reset + 2 + except BadRoute: + self._head = reset + raise + else: + self._emit(tokens.TagOpenOpen(wiki_markup="|-")) + self._emit_text("tr") + if style: + # this looks highly suspicious + # if type(style[0] == tokens.Text): + # style.pop(0) + self._emit_all(style) + self._emit(tokens.TagCloseSelfclose()) + self._head -= 1 def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse as HTML attributes""" @@ -1047,9 +1077,10 @@ class Tokenizer(object): self._head = reset + len(markup) try: style = self._parse_as_table_style("|") + # Don't parse the style separator + self._head += 1 (cell_context, cell) = self._parse(table_context) except BadRoute: - assert False self._head = reset raise self._emit(tokens.TagOpenOpen(wiki_markup=markup)) @@ -1066,7 +1097,7 @@ class Tokenizer(object): # offset displacement done by _parse() self._head -= 1 - def _parse_as_table_style(self, end_token): + def _parse_as_table_style(self, end_token, break_on_table_end=False): data = _TagOpenData() data.context = _TagOpenData.CX_ATTR_READY while True: @@ -1086,7 +1117,9 @@ class Tokenizer(object): elif this == end_token and can_exit: if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): self._push_tag_buffer(data) - self._head += 1 + # self._head += 1 + return self._pop() + elif break_on_table_end and this == "|" and next == "}": return self._pop() else: self._handle_tag_data(data, this) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 1087381..fa068fd 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -127,7 +127,7 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. 
input: "{| \n | name|="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] --- @@ -135,3 +135,25 @@ name: table_cell_attributes_pipe_after_equals label: Pipe inside an attribute should still be used as a style separator after an equals. input: "{| \n | name=|"foo|bar"| test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(), Text(text="\"foo|bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_row_attributes +label: Parse table row style attributes. +input: "{| \n |- name="foo bar"\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_row_attributes_crazy_whitespace +label: Parse table row style attributes with different whitespace. +input: "{| \t \n |- \t name="foo bar"\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \t \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + + +--- + +name: table_attributes +label: Parse table style attributes. +input: "{| name="foo bar"\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] From d356a570b32d849ba581a02b77f2aa5b8cdb8ba2 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 14:37:58 -0700 Subject: [PATCH 045/102] Added closing_wiki_markup support to Tag node Added support for allowing different wiki syntax for replacing the opening and closing tags. Added for table support. 
--- mwparserfromhell/nodes/tag.py | 34 +++++++++++++++++++++++++-- mwparserfromhell/parser/builder.py | 4 +++- tests/test_tag.py | 18 +++++++++++++++ tests/tokenizer/tables.mwtest | 47 ++++++++++++++++++++++---------------- 4 files changed, 80 insertions(+), 23 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 7cbe78d..0fe580f 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -35,7 +35,7 @@ class Tag(Node): def __init__(self, tag, contents=None, attrs=None, wiki_markup=None, self_closing=False, invalid=False, implicit=False, padding="", - closing_tag=None): + closing_tag=None, closing_wiki_markup=None): super(Tag, self).__init__() self._tag = tag if contents is None and not self_closing: @@ -44,6 +44,13 @@ class Tag(Node): self._contents = contents self._attrs = attrs if attrs else [] self._wiki_markup = wiki_markup + if wiki_markup and not self_closing: + if closing_wiki_markup: + self._closing_wiki_markup = closing_wiki_markup + else: + self._closing_wiki_markup = wiki_markup + else: + self._closing_wiki_markup = None self._self_closing = self_closing self._invalid = invalid self._implicit = implicit @@ -55,10 +62,11 @@ class Tag(Node): def __unicode__(self): if self.wiki_markup: + attrs = "".join([str(attr) for attr in self.attributes]) if self.attributes else "" if self.self_closing: return self.wiki_markup else: - return self.wiki_markup + str(self.contents) + self.wiki_markup + return self.wiki_markup + attrs + str(self.contents) + self.closing_wiki_markup result = ("``).""" return self._self_closing @@ -185,10 +206,19 @@ class Tag(Node): @wiki_markup.setter def wiki_markup(self, value): self._wiki_markup = str(value) if value else None + if not value or not self.closing_wiki_markup: + self.closing_wiki_markup = str(value) if value else None + + + @closing_wiki_markup.setter + def closing_wiki_markup(self, value): + self._closing_wiki_markup = str(value) if value and not self.self_closing else None @self_closing.setter def self_closing(self, value): self._self_closing = bool(value) + if not bool(value): + self.closing_wiki_markup = None @invalid.setter def invalid(self, value): diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 2d68036..8d1852e 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -248,6 +248,7 @@ class Builder(object): close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose) implicit, attrs, contents, closing_tag = False, [], None, None wiki_markup, invalid = token.wiki_markup, token.invalid or False + closing_wiki_markup = None self._push() while self._tokens: token = self._tokens.pop() @@ -258,6 +259,7 @@ class Builder(object): tag = self._pop() self._push() elif isinstance(token, tokens.TagOpenClose): + closing_wiki_markup = token.wiki_markup contents = self._pop() self._push() elif isinstance(token, close_tokens): @@ -270,7 +272,7 @@ class Builder(object): self_closing = False closing_tag = self._pop() return Tag(tag, contents, attrs, wiki_markup, self_closing, - invalid, implicit, padding, closing_tag) + invalid, implicit, padding, closing_tag, closing_wiki_markup) else: self._write(self._handle_token(token)) raise ParserError("_handle_tag() missed a close token") diff --git a/tests/test_tag.py b/tests/test_tag.py index 7577cce..950233f 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -171,6 +171,24 @@ class TestTag(TreeEqualityTestCase): self.assertFalse(node.wiki_markup) 
self.assertEqual("italic text", node) + def test_closing_wiki_markup(self): + """test getter/setter behavior for closing_wiki_markup attribute""" + node = Tag(wraptext("table"), wraptext("\n")) + self.assertIs(None, node.closing_wiki_markup) + node.wiki_markup = "{|" + self.assertEqual("{|", node.closing_wiki_markup) + node.closing_wiki_markup = "|}" + self.assertEqual("|}", node.closing_wiki_markup) + self.assertEqual("{|\n|}", node) + node.wiki_markup = False + self.assertFalse(node.closing_wiki_markup) + node.self_closing = True + node.wiki_markup = "{|" + self.assertIs(None, node.closing_wiki_markup) + node.wiki_markup = False + node.self_closing = False + self.assertEqual("\n
    ", node) + def test_self_closing(self): """test getter/setter for the self_closing attribute""" node = Tag(wraptext("ref"), wraptext("foobar")) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index fa068fd..bfdd83f 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -1,14 +1,14 @@ name: empty_table label: Parsing an empty table. input: "{|\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_table label: Correctly handle tables with close on the same line. input: "{||}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -29,7 +29,7 @@ output: [Text(text="{| | ")] name: leading_whitespace_table label: Handle leading whitespace for a table. input: "foo \n \t {|\n|}" -output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -43,112 +43,119 @@ output: [Text(text="foo \n foo \t {|\n|}")] name: table_row_simple label: Simple table row. input: "{|\n |- \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(), Text(text=" \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_simple label: Simple table cell. input: "{|\n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_inline label: Multiple inline table cells. 
input: "{|\n | foo || bar || test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_simple label: Simple header cell. input: "{|\n ! foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_inline label: Multiple inline header cells. input: "{|\n ! foo || bar !! test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! bar \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_text_outside_cell label: Parse text inside table but outside of a cell. 
input: "{|\n bar \n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_cell_with_leading_characters label: Fail to create a table cell when there are leading non-whitespace characters. input: "{|\n bar | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar | foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_row_with_leading_characters label: Fail to create a table row when there are leading non-whitespace characters. input: "{|\n bar |- foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar |- foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: template_inside_table_cell label: Template within table cell. input: "{|\n |{{foo\n|bar=baz}} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes label: Parse table cell style attributes. input: "{| \n | name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_quote_with_pipe label: Pipe inside an attribute quote should still be used as a style separator. 
input: "{| \n | name="foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(), Text(text="bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. input: "{| \n | name|="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_pipe_after_equals label: Pipe inside an attribute should still be used as a style separator after an equals. input: "{| \n | name=|"foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(), Text(text="\"foo|bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes_templates +label: Pipe inside attributes shouldn't be style separator. +input: "{| \n | {{comment|template=baz}} | test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes label: Parse table row style attributes. 
input: "{| \n |- name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes_crazy_whitespace label: Parse table row style attributes with different whitespace. input: "{| \t \n |- \t name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \t \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \t \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -156,4 +163,4 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text name: table_attributes label: Parse table style attributes. input: "{| name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] From 9e4bb0c7e5b0289bc110cb41619b883b57f55954 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 15:45:53 -0700 Subject: [PATCH 046/102] Clean up and style changes Added comments, tried to keep to 80 character lines. 
--- mwparserfromhell/parser/contexts.py | 24 ++++++++++--- mwparserfromhell/parser/tokenizer.py | 67 +++++++++++++++++++----------------- 2 files changed, 55 insertions(+), 36 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 678a392..564ceca 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -90,6 +90,15 @@ Local (stack-specific) contexts: * :const:`FAIL_ON_RBRACE` * :const:`FAIL_ON_EQUALS` +* :const:`TABLE` + + * :const:`TABLE_OPEN` + * :const:`TABLE_CELL_OPEN` + * :const:`TABLE_CELL_STYLE_POSSIBLE` + * :const:`TABLE_TD_LINE` + * :const:`TABLE_TH_LINE` + * :const:`TABLE_CELL_LINE_CONTEXTS` + Global contexts: * :const:`GL_HEADING` @@ -156,10 +165,14 @@ SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) TABLE_OPEN = 1 << 30 -TABLE_CELL_LINE = 1 << 31 -TABLE_HEADER_LINE = 1 << 32 -TABLE_CELL_OPEN = 1 << 33 -TABLE_CELL_STYLE_POSSIBLE = 1 << 34 +TABLE_CELL_OPEN = 1 << 31 +TABLE_CELL_STYLE_POSSIBLE = 1 << 32 +TABLE_TD_LINE = 1 << 33 +TABLE_TH_LINE = 1 << 34 +TABLE_CELL_LINE_CONTEXTS = (TABLE_TD_LINE + TABLE_TH_LINE + + TABLE_CELL_STYLE_POSSIBLE) +TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE_POSSIBLE + + TABLE_TD_LINE + TABLE_TH_LINE) # Global contexts: @@ -167,7 +180,8 @@ GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE_OPEN +FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + + STYLE + TABLE_OPEN) UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index c2d5240..4a9c0f5 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1004,18 +1004,18 @@ class Tokenizer(object): def _handle_table_start(self): """Handle the start of a table.""" - # TODO - fail all other contexts on start? 
self._head += 2 - reset = self._head - 1 + reset = self._head style = None try: self._push(contexts.TABLE_OPEN) style = self._parse_as_table_style("\n", break_on_table_end=True) if len(style) == 0: - self._head = reset + 1 + self._head = reset table = self._parse(contexts.TABLE_OPEN) except BadRoute: - self._head = reset + # offset displacement done by _parse() + self._head = reset - 1 self._emit_text("{|") else: self._emit(tokens.TagOpenOpen(wiki_markup="{|")) @@ -1024,16 +1024,22 @@ class Tokenizer(object): self._emit_all(style) self._emit(tokens.TagCloseOpen()) self._emit_all(table) - self._emit(tokens.TagOpenClose()) + self._emit(tokens.TagOpenClose(wiki_markup="|}")) self._emit_text("table") self._emit(tokens.TagCloseClose()) - # self._emit_style_tag("table", "{|", table) def _handle_table_end(self): + """Return the stack in order to handle the table end.""" self._head += 2 return self._pop() def _handle_table_row(self): + """Parse as style until end of the line, then continue.""" + if not self._can_recurse(): + self._emit_text("|-") + self._head += 2 + return + reset = self._head self._head += 2 try: @@ -1048,22 +1054,20 @@ class Tokenizer(object): self._emit(tokens.TagOpenOpen(wiki_markup="|-")) self._emit_text("tr") if style: - # this looks highly suspicious - # if type(style[0] == tokens.Text): - # style.pop(0) self._emit_all(style) self._emit(tokens.TagCloseSelfclose()) + # offset displacement done by _parse() self._head -= 1 def _handle_table_cell(self, markup, tag, line_context): - """Parse as normal syntax unless we hit a style marker, then parse as HTML attributes""" - table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context + """Parse as normal syntax unless we hit a style marker, then parse style + as HTML attributes and the remainder as normal syntax.""" if not self._can_recurse(): self._emit_text(markup) - # TODO check if this works self._head += len(markup) - 1 return + table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context reset = self._head self._head += len(markup) style = None @@ -1074,8 +1078,10 @@ class Tokenizer(object): raise # except for handling cell style except StopIteration: + self._pop() self._head = reset + len(markup) try: + self._push(table_context) style = self._parse_as_table_style("|") # Don't parse the style separator self._head += 1 @@ -1083,21 +1089,20 @@ class Tokenizer(object): except BadRoute: self._head = reset raise + self._emit(tokens.TagOpenOpen(wiki_markup=markup)) self._emit_text(tag) if style: - # this looks highly suspicious - if type(style[0] == tokens.Text): - style.pop(0) self._emit_all(style) self._emit(tokens.TagCloseSelfclose()) self._emit_all(cell) # keep header/cell line contexts - self._context |= cell_context & (contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_LINE) + self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE) # offset displacement done by _parse() self._head -= 1 def _parse_as_table_style(self, end_token, break_on_table_end=False): + """Parse until ``end_token`` as style attributes for a table.""" data = _TagOpenData() data.context = _TagOpenData.CX_ATTR_READY while True: @@ -1117,7 +1122,6 @@ class Tokenizer(object): elif this == end_token and can_exit: if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): self._push_tag_buffer(data) - # self._head += 1 return self._pop() elif break_on_table_end and this == "|" and next == "}": return self._pop() @@ -1130,7 +1134,7 @@ class Tokenizer(object): return (self._context, self._pop()) def 
_handle_cell_style(self): - """Pop the cell off the stack and try to parse as style""" + """Pop the cell off the stack and try to parse as style.""" raise StopIteration() def _verify_safe(self, this): @@ -1281,7 +1285,10 @@ class Tokenizer(object): self._handle_hr() elif this in ("\n", ":") and self._context & contexts.DL_TERM: self._handle_dl_term() - + if this == "\n": + # kill potential table contexts + self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS + # Start of table parsing elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): if self._can_recurse(): @@ -1293,25 +1300,23 @@ class Tokenizer(object): if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() return self._handle_table_end() - elif this == "|" and next == "|" and self._context & contexts.TABLE_CELL_LINE: + elif this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE: if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() - self._handle_table_cell("||", "td", contexts.TABLE_CELL_LINE) - elif this == "|" and next == "|" and self._context & contexts.TABLE_HEADER_LINE: + self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE) + elif this == "|" and next == "|" and self._context & contexts.TABLE_TH_LINE: if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() - self._handle_table_cell("||", "th", contexts.TABLE_HEADER_LINE) - elif this == "!" and next == "!" and self._context & contexts.TABLE_HEADER_LINE: + self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE) + elif this == "!" and next == "!" and self._context & contexts.TABLE_TH_LINE: if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() - self._handle_table_cell("!!", "th", contexts.TABLE_HEADER_LINE) + self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE) elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: self._handle_cell_style() # on newline, clear out cell line contexts - elif this == "\n" and self._context & (contexts.TABLE_CELL_LINE | contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_STYLE_POSSIBLE): - # TODO might not be handled due to DL_TERM code above - # TODO does this even work? - self._context &= (~contexts.TABLE_CELL_LINE & ~contexts.TABLE_HEADER_LINE & ~contexts.TABLE_CELL_STYLE_POSSIBLE) + elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS: + self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS self._emit_text(this) elif (self._read(-1) in ("\n", self.START) or (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): @@ -1322,11 +1327,11 @@ class Tokenizer(object): elif this == "|": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() - self._handle_table_cell("|", "td", contexts.TABLE_CELL_LINE) + self._handle_table_cell("|", "td", contexts.TABLE_TD_LINE) elif this == "!": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() - self._handle_table_cell("!", "th", contexts.TABLE_HEADER_LINE) + self._handle_table_cell("!", "th", contexts.TABLE_TH_LINE) else: self._emit_text(this) else: From ec080018716f66efdb09332ad6de8bf7b8096e99 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 18:19:48 -0700 Subject: [PATCH 047/102] Tables and rows now use newline as padding Tables and rows use newlines as padding, partly because these characters are pretty important to the integrity of the table. 
They might need to be in the preceding whitespace of inner tags instead as padding after, not sure. --- mwparserfromhell/nodes/tag.py | 39 +++++++++++----------- mwparserfromhell/parser/builder.py | 1 + mwparserfromhell/parser/tokenizer.py | 32 ++++++++++-------- tests/test_tag.py | 5 --- tests/tokenizer/tables.mwtest | 65 ++++++++++++++++++++++++------------ 5 files changed, 81 insertions(+), 61 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 0fe580f..b3ea85c 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -44,11 +44,10 @@ class Tag(Node): self._contents = contents self._attrs = attrs if attrs else [] self._wiki_markup = wiki_markup - if wiki_markup and not self_closing: - if closing_wiki_markup: - self._closing_wiki_markup = closing_wiki_markup - else: - self._closing_wiki_markup = wiki_markup + if closing_wiki_markup: + self._closing_wiki_markup = closing_wiki_markup + elif wiki_markup and not self_closing: + self._closing_wiki_markup = wiki_markup else: self._closing_wiki_markup = None self._self_closing = self_closing @@ -63,10 +62,12 @@ class Tag(Node): def __unicode__(self): if self.wiki_markup: attrs = "".join([str(attr) for attr in self.attributes]) if self.attributes else "" + close = self.closing_wiki_markup if self.closing_wiki_markup else "" + padding = self.padding if self.padding else "" if self.self_closing: - return self.wiki_markup + return self.wiki_markup + attrs + close + padding else: - return self.wiki_markup + attrs + str(self.contents) + self.closing_wiki_markup + return self.wiki_markup + attrs + padding + str(self.contents) + close result = ("\n", node) def test_self_closing(self): diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index bfdd83f..7cf826c 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -1,14 +1,14 @@ name: empty_table label: Parsing an empty table. input: "{|\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_table label: Correctly handle tables with close on the same line. input: "{||}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=""), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -29,7 +29,7 @@ output: [Text(text="{| | ")] name: leading_whitespace_table label: Handle leading whitespace for a table. input: "foo \n \t {|\n|}" -output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -43,119 +43,133 @@ output: [Text(text="foo \n foo \t {|\n|}")] name: table_row_simple label: Simple table row. 
input: "{|\n |- \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_simple label: Simple table cell. input: "{|\n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_inline label: Multiple inline table cells. input: "{|\n | foo || bar || test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_simple label: Simple header cell. input: "{|\n ! foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_inline label: Multiple inline header cells. input: "{|\n ! foo || bar !! 
test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! bar \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_text_outside_cell label: Parse text inside table but outside of a cell. input: "{|\n bar \n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_cell_with_leading_characters label: Fail to create a table cell when there are leading non-whitespace characters. input: "{|\n bar | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_row_with_leading_characters label: Fail to create a table row when there are leading non-whitespace characters. 
input: "{|\n bar |- foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: template_inside_table_cell label: Template within table cell. input: "{|\n |{{foo\n|bar=baz}} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes label: Parse table cell style attributes. input: "{| \n | name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_quote_with_pipe label: Pipe inside an attribute quote should still be used as a style separator. input: "{| \n | name="foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|"), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. 
input: "{| \n | name|="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text="" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|"), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_pipe_after_equals label: Pipe inside an attribute should still be used as a style separator after an equals. input: "{| \n | name=|"foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(wiki_markup="|"), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_templates label: Pipe inside attributes shouldn't be style separator. input: "{| \n | {{comment|template=baz}} | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: header_cell_attributes +label: Parse header cell style attributes. +input: "{| \n ! name="foo bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: inline_cell_attributes +label: Parse cell style attributes of inline cells. +input: "{| \n ! 
name="foo bar" | test ||color="red"| markup!!foo | time \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseSelfclose(wiki_markup="|"), Text(text=" markup"), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|"), Text(text=" time \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes label: Parse table row style attributes. input: "{| \n |- name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes_crazy_whitespace label: Parse table row style attributes with different whitespace. -input: "{| \t \n |- \t name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \t \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +input: "{| \t \n |- \t name="foo bar" \t \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -163,4 +177,11 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text name: table_attributes label: Parse table style attributes. 
input: "{| name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: inline_table_attributes +label: Correctly handle attributes in inline tables. +input: "{| foo="tee bar" |}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="tee bar"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] From f1664a8d67d7544d6524bd8de3ab3e554247bc2e Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 10:00:58 -0700 Subject: [PATCH 048/102] Updated row and table handling Changed row recursion handling to make sure the tag is emitted even when hitting recursion limits. Need to test table recursion to make sure that works. Also fixed a bug in which tables were eating the trailing token. Added several tests for rows and trailing tokens with tables. --- mwparserfromhell/parser/tokenizer.py | 33 ++++++++++++++++----------------- tests/tokenizer/tables.mwtest | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 0829e7d..787ea0a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1027,6 +1027,8 @@ class Tokenizer(object): self._emit(tokens.TagOpenClose(wiki_markup="|}")) self._emit_text("table") self._emit(tokens.TagCloseClose()) + # offset displacement done by _parse() + self._head -= 1 def _handle_table_end(self): """Return the stack in order to handle the table end.""" @@ -1035,25 +1037,22 @@ class Tokenizer(object): def _handle_table_row(self): """Parse as style until end of the line, then continue.""" - if not self._can_recurse(): - self._emit_text("|-") - self._head += 2 - return - reset = self._head self._head += 2 - try: - self._push(contexts.TABLE_OPEN) - (style, padding) = self._parse_as_table_style("\n") - except BadRoute: - self._head = reset - raise - else: - self._emit(tokens.TagOpenOpen(wiki_markup="|-")) - self._emit_text("tr") - if style: - self._emit_all(style) - self._emit(tokens.TagCloseSelfclose(padding=padding)) + style, padding = None, "" + # If we can't recurse, still tokenize tag but parse style attrs as text + if self._can_recurse(): + try: + self._push(contexts.TABLE_OPEN) + (style, padding) = self._parse_as_table_style("\n") + except BadRoute: + self._head = reset + raise + self._emit(tokens.TagOpenOpen(wiki_markup="|-")) + self._emit_text("tr") + if style: + self._emit_all(style) + self._emit(tokens.TagCloseSelfclose(padding=padding)) def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse style diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 7cf826c..2770227 100644 --- a/tests/tokenizer/tables.mwtest +++ 
b/tests/tokenizer/tables.mwtest @@ -26,6 +26,13 @@ output: [Text(text="{| | ")] --- +name: no_table_close_inside_row +label: Handle case when there is no table close while inside of a row. +input: "{| |- " +output: [Text(text="{| |- ")] + +--- + name: leading_whitespace_table label: Handle leading whitespace for a table. input: "foo \n \t {|\n|}" @@ -33,6 +40,27 @@ output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="t --- +name: whitespace_after_table +label: Handle whitespace after a table close. +input: "{|\n|}\n \t " +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text="\n \t ")] + +--- + +name: different_whitespace_after_table +label: Handle spaces after a table close. +input: "{|\n|} \n " +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n ")] + +--- + +name: characters_after_table +label: Handle characters after a table close. +input: "{|\n|} tsta" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" tsta")] + +--- + name: leading_characters_table label: Don't parse as a table when leading characters are not newline or whitespace. input: "foo \n foo \t {|\n|}" @@ -47,6 +75,13 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- +name: table_row_multiple +label: Simple table row. +input: "{|\n |- \n|- \n |-\n |}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding="\n"), Text(text=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + name: table_cell_simple label: Simple table cell. input: "{|\n | foo \n|}" @@ -171,7 +206,6 @@ label: Parse table row style attributes with different whitespace. input: "{| \t \n |- \t name="foo bar" \t \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] - --- name: table_attributes From 842af20c38c65188061811959eac8b6e263fd1f2 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 12:23:38 -0700 Subject: [PATCH 049/102] fixed hacky table cell style exception, added tests Removed the `StopIteration()` exception for handling table style and instead call `_handle_table_cell_end()` with a new parameter. Also added some random tests for table openings. 
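The shape of this change is worth spelling out, since the same pattern
recurs when porting to C. A minimal, self-contained sketch of the idea,
using hypothetical names rather than the real tokenizer API: the signal
"rewind and re-parse this cell as style attributes" now travels back
through an ordinary return value instead of an exception, which also
maps directly onto C-style status returns:

    def parse_cell(text):
        # Pretend parser: a "|" in the body means what we read so far
        # was really a style attribute block, so the caller must rewind.
        if "|" in text:
            return None, True          # (result, reset_for_style)
        return text.strip(), False

    def handle_cell(text):
        result, reset_for_style = parse_cell(text)
        if reset_for_style:
            # Rewind and re-parse with the style part split off. Nothing
            # unwinds the stack, so no cleanup handler is needed.
            style, _, body = text.partition("|")
            result = (style.strip(), body.strip())
        return result

    assert handle_cell(" plain cell ") == "plain cell"
    assert handle_cell("align=left| foo") == ("align=left", "foo")
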
--- mwparserfromhell/parser/tokenizer.py | 22 ++++++++-------------- tests/tokenizer/tables.mwtest | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 787ea0a..0de2831 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1067,24 +1067,21 @@ class Tokenizer(object): self._head += len(markup) style = None try: - (cell_context, cell) = self._parse(table_context | contexts.TABLE_CELL_STYLE_POSSIBLE) + cell_context, cell, reset_for_style = self._parse(table_context | contexts.TABLE_CELL_STYLE_POSSIBLE) except BadRoute: self._head = reset raise - # except for handling cell style - except StopIteration: - self._pop() + if reset_for_style: self._head = reset + len(markup) try: self._push(table_context) (style, padding) = self._parse_as_table_style("|") # Don't parse the style separator self._head += 1 - (cell_context, cell) = self._parse(table_context) + cell_context, cell, reset_for_style = self._parse(table_context) except BadRoute: self._head = reset raise - self._emit(tokens.TagOpenOpen(wiki_markup=markup)) self._emit_text(tag) if style: @@ -1132,13 +1129,10 @@ class Tokenizer(object): self._handle_tag_data(data, this) self._head += 1 - def _handle_table_cell_end(self): - """Returns the context and stack in a tuple.""" - return (self._context, self._pop()) - - def _handle_cell_style(self): - """Pop the cell off the stack and try to parse as style.""" - raise StopIteration() + def _handle_table_cell_end(self, reset_for_style=False): + """Returns the context, stack, and whether to reset the cell for style + in a tuple.""" + return self._context, self._pop(), reset_for_style def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" @@ -1316,7 +1310,7 @@ class Tokenizer(object): return self._handle_table_cell_end() self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE) elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: - self._handle_cell_style() + return self._handle_table_cell_end(reset_for_style=True) # on newline, clear out cell line contexts elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS: self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 2770227..184e695 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -33,6 +33,34 @@ output: [Text(text="{| |- ")] --- +name: no_table_close_attributes +label: Don't parse attributes as attributes if the table doesn't exist. +input: "{| border="1"" +output: [Text(text="{| border=\"1\"")] + +--- + +name: no_table_close_row_attributes +label: Don't parse row attributes as attributes if the table doesn't exist. +input: "{| |- border="1"" +output: [Text(text="{| |- border=\"1\"")] + +--- + +name: no_table_close_cell +label: Don't parse cells if the table doesn't close. +input: "{| | border="1"| test || red | foo" +output: [Text(text="{| | border=\"1\"| test || red | foo")] + +--- + +name: crazy_no_table_close +label: Lost of opened wiki syntax without closes. +input: "{{{ {{ {| Date: Wed, 16 Jul 2014 12:28:40 -0700 Subject: [PATCH 050/102] Reorder table tokenizer methods for forward declaration Make sure py tokenizer methods only call methods that have been declared earlier. Not necessary but makes it much easier to maintain/write the C tokenizer if methods are in the same order. 
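The reorder is a behavioral no-op in Python, because names in a function
body are resolved when the function is called, not when it is defined;
it only pays off when mirroring the file in C, where every function must
be declared before its first use. A small demonstration:

    def caller():
        # helper() does not exist yet when this def statement runs, but
        # the name is looked up at call time, so definition order is
        # irrelevant in Python (unlike C without forward declarations).
        return helper()

    def helper():
        return 42

    assert caller() == 42
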
--- mwparserfromhell/parser/tokenizer.py | 68 ++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 0de2831..db4a8cf 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1002,6 +1002,40 @@ class Tokenizer(object): self._fail_route() return self._pop() + def _parse_as_table_style(self, end_token, break_on_table_end=False): + """Parse until ``end_token`` as style attributes for a table.""" + data = _TagOpenData() + data.context = _TagOpenData.CX_ATTR_READY + while True: + this, next = self._read(), self._read(1) + can_exit = (not data.context & (data.CX_NAME) or + data.context & data.CX_NOTE_SPACE) + if this is self.END: + if self._context & contexts.TAG_ATTR: + if data.context & data.CX_QUOTED: + # Unclosed attribute quote: reset, don't die + data.context = data.CX_ATTR_VALUE + self._pop() + self._head = data.reset + continue + self._pop() + self._fail_route() + elif this == end_token and can_exit: + if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): + self._push_tag_buffer(data) + if this.isspace(): + data.padding_buffer["first"] += this + return (self._pop(), data.padding_buffer["first"]) + elif break_on_table_end and this == "|" and next == "}": + if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): + self._push_tag_buffer(data) + if this.isspace(): + data.padding_buffer["first"] += this + return (self._pop(), data.padding_buffer["first"]) + else: + self._handle_tag_data(data, this) + self._head += 1 + def _handle_table_start(self): """Handle the start of a table.""" self._head += 2 @@ -1095,40 +1129,6 @@ class Tokenizer(object): # offset displacement done by _parse() self._head -= 1 - def _parse_as_table_style(self, end_token, break_on_table_end=False): - """Parse until ``end_token`` as style attributes for a table.""" - data = _TagOpenData() - data.context = _TagOpenData.CX_ATTR_READY - while True: - this, next = self._read(), self._read(1) - can_exit = (not data.context & (data.CX_NAME) or - data.context & data.CX_NOTE_SPACE) - if this is self.END: - if self._context & contexts.TAG_ATTR: - if data.context & data.CX_QUOTED: - # Unclosed attribute quote: reset, don't die - data.context = data.CX_ATTR_VALUE - self._pop() - self._head = data.reset - continue - self._pop() - self._fail_route() - elif this == end_token and can_exit: - if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): - self._push_tag_buffer(data) - if this.isspace(): - data.padding_buffer["first"] += this - return (self._pop(), data.padding_buffer["first"]) - elif break_on_table_end and this == "|" and next == "}": - if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): - self._push_tag_buffer(data) - if this.isspace(): - data.padding_buffer["first"] += this - return (self._pop(), data.padding_buffer["first"]) - else: - self._handle_tag_data(data, this) - self._head += 1 - def _handle_table_cell_end(self, reset_for_style=False): """Returns the context, stack, and whether to reset the cell for style in a tuple.""" From 457b2240457a7ed256c7bdf290d9672a4575f435 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 13:07:11 -0700 Subject: [PATCH 051/102] Add padding to table cell tags Padding now included on all wiki table cells. With wiki table cells that include attributes, `wiki_markup` is also included (unchanged). 
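The reason for threading padding through every cell token is lossless
round-tripping: the token stream must preserve every byte of the input
so the parsed tree can be rendered back to identical wikitext. A rough
sketch with a hypothetical stand-in class (not the real tokens module)
of how a padding attribute carries otherwise-lost whitespace:

    class CloseToken:
        # Illustrative only: stores the style separator ("|") and any
        # whitespace after it, so rendering reproduces the source text.
        def __init__(self, wiki_markup=None, padding=""):
            self.wiki_markup = wiki_markup
            self.padding = padding

        def render(self):
            return (self.wiki_markup or "") + self.padding

    token = CloseToken(wiki_markup="|", padding=" ")
    assert token.render() == "| "
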
---
 mwparserfromhell/parser/tokenizer.py | 12 +++++-----
 tests/tokenizer/tables.mwtest        | 44 ++++++++++++++++++++++++------------
 2 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index db4a8cf..c404ebb 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -1002,7 +1002,7 @@ class Tokenizer(object):
             self._fail_route()
         return self._pop()
 
-    def _parse_as_table_style(self, end_token, break_on_table_end=False):
+    def _parse_as_table_style(self, end_token, break_on_table_end=False):
         """Parse until ``end_token`` as style attributes for a table."""
         data = _TagOpenData()
         data.context = _TagOpenData.CX_ATTR_READY
@@ -1099,7 +1099,7 @@ class Tokenizer(object):
         table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context
         reset = self._head
         self._head += len(markup)
-        style = None
+        reset_for_style, padding = False, ""
         try:
             cell_context, cell, reset_for_style = self._parse(table_context | contexts.TABLE_CELL_STYLE_POSSIBLE)
         except BadRoute:
             self._head = reset
             raise
@@ -1112,17 +1112,17 @@ class Tokenizer(object):
             (style, padding) = self._parse_as_table_style("|")
             # Don't parse the style separator
             self._head += 1
-            cell_context, cell, reset_for_style = self._parse(table_context)
+            cell_context, cell, unused = self._parse(table_context)
         except BadRoute:
             self._head = reset
             raise
         self._emit(tokens.TagOpenOpen(wiki_markup=markup))
         self._emit_text(tag)
-        if style:
+        if reset_for_style:
             self._emit_all(style)
-            self._emit(tokens.TagCloseSelfclose(wiki_markup="|"))
+            self._emit(tokens.TagCloseSelfclose(wiki_markup="|", padding=padding))
         else:
-            self._emit(tokens.TagCloseSelfclose())
+            self._emit(tokens.TagCloseSelfclose(padding=padding))
         self._emit_all(cell)
         # keep header/cell line contexts
         self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE)
diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest
index 184e695..3f3a68d 100644
--- a/tests/tokenizer/tables.mwtest
+++ b/tests/tokenizer/tables.mwtest
@@ -113,42 +113,42 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding
 
 name: table_cell_simple
 label: Simple table cell.
 input: "{|\n | foo \n|}"
-output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]
+output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]
 
 ---
 
 name: table_cell_inline
 label: Multiple inline table cells.
input: "{|\n | foo || bar || test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_simple label: Simple header cell. input: "{|\n ! foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_inline label: Multiple inline header cells. input: "{|\n ! foo || bar !! test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! bar \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! 
!!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_text_outside_cell label: Parse text inside table but outside of a cell. input: "{|\n bar \n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -169,56 +169,70 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: template_inside_table_cell label: Template within table cell. input: "{|\n |{{foo\n|bar=baz}} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes label: Parse table cell style attributes. input: "{| \n | name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_empty_attributes +label: Parse table cell with style markers but no attributes. +input: "{| \n | | test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_with_dash +label: Parse a situation in which a cell line looks like a row line. 
+input: "{|\n ||- \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_quote_with_pipe label: Pipe inside an attribute quote should still be used as a style separator. input: "{| \n | name="foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|"), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. -input: "{| \n | name|="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text="" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|"), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +input: "{| \n | name|="foo bar" | test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text="" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_pipe_after_equals label: Pipe inside an attribute should still be used as a style separator after an equals. input: "{| \n | name=|"foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(wiki_markup="|"), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_templates label: Pipe inside attributes shouldn't be style separator. 
input: "{| \n | {{comment|template=baz}} | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: header_cell_attributes label: Parse header cell style attributes. input: "{| \n ! name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_cell_attributes label: Parse cell style attributes of inline cells. input: "{| \n ! 
name="foo bar" | test ||color="red"| markup!!foo | time \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseSelfclose(wiki_markup="|"), Text(text=" markup"), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|"), Text(text=" time \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- From 8b5d6f9a3b8892ee9b05e0cf0025475e14f814e0 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 14:31:40 -0700 Subject: [PATCH 052/102] Changes to table close handling Fix problem in which fake table closes were causing a problem inside cells. Changed inline table handling to fix this. 
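The rule being enforced can be sketched outside the tokenizer: "|}" only
closes a table when it begins a line, optionally after whitespace;
anywhere else, as in the new table_cell_fake_close test, the pipe is a
style separator and the brace is plain text. A simplified, hypothetical
checker (the real implementation tracks parser contexts instead of
scanning backwards):

    def is_table_close(text, i):
        # "|}" counts as a close only at the start of a line, possibly
        # preceded by spaces or tabs (a simplification of the real rule).
        if text[i:i + 2] != "|}":
            return False
        j = i - 1
        while j >= 0 and text[j] in " \t":
            j -= 1
        return j < 0 or text[j] == "\n"

    text = "{|\n | |} \n|}"
    closes = [i for i in range(len(text)) if is_table_close(text, i)]
    assert closes == [10]  # only the final "|}", at a line start, closes
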
--- mwparserfromhell/parser/tokenizer.py | 29 ++++++++++++++++------------- tests/tokenizer/tables.mwtest | 28 ++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index c404ebb..b70e932 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1029,8 +1029,6 @@ class Tokenizer(object): elif break_on_table_end and this == "|" and next == "}": if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): self._push_tag_buffer(data) - if this.isspace(): - data.padding_buffer["first"] += this return (self._pop(), data.padding_buffer["first"]) else: self._handle_tag_data(data, this) @@ -1040,13 +1038,17 @@ class Tokenizer(object): """Handle the start of a table.""" self._head += 2 reset = self._head - style = None + style, table = None, None try: self._push(contexts.TABLE_OPEN) (style, padding) = self._parse_as_table_style("\n", break_on_table_end=True) - # Have to do this in the case of inline tables - self._head += 1 if "\n" in padding else 0 - table = self._parse(contexts.TABLE_OPEN) + # continue to parse if it is NOT an inline table + if "\n" in padding: + self._head += 1 + table = self._parse(contexts.TABLE_OPEN) + else: + # close tag + self._head += 2 except BadRoute: # offset displacement done by _parse() self._head = reset - 1 @@ -1057,7 +1059,8 @@ class Tokenizer(object): if style: self._emit_all(style) self._emit(tokens.TagCloseOpen(padding=padding)) - self._emit_all(table) + if table: + self._emit_all(table) self._emit(tokens.TagOpenClose(wiki_markup="|}")) self._emit_text("table") self._emit(tokens.TagCloseClose()) @@ -1293,11 +1296,7 @@ class Tokenizer(object): else: self._emit_text("{|") elif self._context & contexts.TABLE_OPEN: - if this == "|" and next == "}": - if self._context & contexts.TABLE_CELL_OPEN: - return self._handle_table_cell_end() - return self._handle_table_end() - elif this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE: + if this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE: if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE) @@ -1317,7 +1316,11 @@ class Tokenizer(object): self._emit_text(this) elif (self._read(-1) in ("\n", self.START) or (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): - if this == "|" and next == "-": + if this == "|" and next == "}": + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + return self._handle_table_end() + elif this == "|" and next == "-": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() self._handle_table_row() diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 3f3a68d..e63bd11 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -89,6 +89,13 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- +name: characters_after_inline_table +label: Handle characters after an inline table close. +input: "{| |} tsta" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" tsta")] + +--- + name: leading_characters_table label: Don't parse as a table when leading characters are not newline or whitespace. 
input: "foo \n foo \t {|\n|}" @@ -124,6 +131,27 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- +name: table_cell_fake_close +label: Looks like a table close but is not. +input: "{|\n | |} \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text="} \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_more_fake_close +label: Looks like a table close but is not. +input: "{|\n || |} \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" |} \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_extra_close +label: Process second close as text. +input: "{| \n |} \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n|}")] + +--- + name: table_header_simple label: Simple header cell. input: "{|\n ! foo \n|}" From 151a73e4371c26dea5b20169a3acd26ca3f7f711 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 15:03:26 -0700 Subject: [PATCH 053/102] Fix issue with incorrect table attributes Fix problem in which invalid table attributes were being parsed incorrectly. Added tests. --- mwparserfromhell/parser/tokenizer.py | 21 +++++++++------------ tests/tokenizer/tables.mwtest | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b70e932..7bfd11a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1008,9 +1008,16 @@ class Tokenizer(object): data.context = _TagOpenData.CX_ATTR_READY while True: this, next = self._read(), self._read(1) - can_exit = (not data.context & (data.CX_NAME) or + table_end = break_on_table_end and this == "|" and next == "}" + can_exit = (not data.context & data.CX_QUOTED or data.context & data.CX_NOTE_SPACE) - if this is self.END: + if (this == end_token and can_exit) or table_end: + if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): + self._push_tag_buffer(data) + if this.isspace(): + data.padding_buffer["first"] += this + return (self._pop(), data.padding_buffer["first"]) + elif this is self.END or table_end or this == end_token: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: # Unclosed attribute quote: reset, don't die @@ -1020,16 +1027,6 @@ class Tokenizer(object): continue self._pop() self._fail_route() - elif this == end_token and can_exit: - if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): - self._push_tag_buffer(data) - if this.isspace(): - data.padding_buffer["first"] += this - return (self._pop(), data.padding_buffer["first"]) - elif break_on_table_end and this == "|" and next == "}": - if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): - self._push_tag_buffer(data) - return (self._pop(), data.padding_buffer["first"]) else: self._handle_tag_data(data, this) self._head += 1 diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index e63bd11..163579b 100644 --- a/tests/tokenizer/tables.mwtest +++ 
b/tests/tokenizer/tables.mwtest @@ -225,14 +225,14 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: table_cell_attributes_quote_with_pipe label: Pipe inside an attribute quote should still be used as a style separator. input: "{| \n | name="foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. input: "{| \n | name|="foo bar" | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text="" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -274,7 +274,7 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: table_row_attributes_crazy_whitespace label: Parse table row style attributes with different whitespace. input: "{| \t \n |- \t name="foo bar" \t \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -289,3 +289,32 @@ name: inline_table_attributes label: Correctly handle attributes in inline tables. 
input: "{| foo="tee bar" |}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="tee bar"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_incorrect_attributes +label: Parse incorrect table style attributes. +input: "{| name="foo\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_unclosed_style +label: Parse unclosed and closed bold and italics inside cells. +input: "{|\n | ''foo || '''bar ||''baz''||'''test'''\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" ''foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" '''bar "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text="baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose() Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + + +--- + +name: recursion_five_hundred_opens +label: test potentially dangerous recursion: five hundred table openings, without spaces +input: "{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|" +output: [Text(text="{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|")] + +--- + +name: recursion_one_hundred_opens +label: test potentially dangerous recursion: one hundred table openings, with spaces +input: "{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|" +output: [Text(text="{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| 
{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|")] \ No newline at end of file From e6ec5dc4de743f62889c65272448bdb1041fea29 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 18:11:12 -0700 Subject: [PATCH 054/102] Refactor methods to avoid returning tuples Various changes to avoid returning tuples - working on the C tokenizer made me realize this was a bad idea for compatability/similarity between the two. --- mwparserfromhell/parser/contexts.py | 17 ++++++++--------- mwparserfromhell/parser/tokenizer.py | 30 +++++++++++++++++++----------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 564ceca..3827708 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -164,15 +164,14 @@ FAIL_ON_EQUALS = 1 << 29 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) -TABLE_OPEN = 1 << 30 -TABLE_CELL_OPEN = 1 << 31 -TABLE_CELL_STYLE_POSSIBLE = 1 << 32 -TABLE_TD_LINE = 1 << 33 -TABLE_TH_LINE = 1 << 34 -TABLE_CELL_LINE_CONTEXTS = (TABLE_TD_LINE + TABLE_TH_LINE + - TABLE_CELL_STYLE_POSSIBLE) -TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE_POSSIBLE + - TABLE_TD_LINE + TABLE_TH_LINE) +TABLE_OPEN = 1 << 30 +TABLE_CELL_OPEN = 1 << 31 +TABLE_CELL_STYLE = 1 << 32 +TABLE_TD_LINE = 1 << 33 +TABLE_TH_LINE = 1 << 34 +TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE +TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_TD_LINE + + TABLE_TH_LINE) # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 7bfd11a..7fda2d5 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1016,7 +1016,7 @@ class Tokenizer(object): self._push_tag_buffer(data) if this.isspace(): data.padding_buffer["first"] += this - return (self._pop(), data.padding_buffer["first"]) + return data.padding_buffer["first"] elif this is self.END or table_end or this == end_token: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: @@ -1038,7 +1038,8 @@ class Tokenizer(object): style, table = None, None try: self._push(contexts.TABLE_OPEN) - (style, padding) = self._parse_as_table_style("\n", break_on_table_end=True) + padding = self._parse_as_table_style("\n", break_on_table_end=True) + style = self._pop() # continue to parse if it is NOT an inline table if "\n" in padding: self._head += 1 @@ -1078,7 +1079,8 @@ class Tokenizer(object): if self._can_recurse(): try: self._push(contexts.TABLE_OPEN) - (style, padding) = self._parse_as_table_style("\n") + padding = self._parse_as_table_style("\n") + style = self._pop() except BadRoute: self._head = reset raise @@ -1099,9 +1101,11 @@ class Tokenizer(object): table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context reset = self._head self._head += len(markup) - rest_for_style, padding = False, "" + reset_for_style, padding = False, "" try: - cell_context, cell, reset_for_style = self._parse(table_context | contexts.TABLE_CELL_STYLE_POSSIBLE) + cell_context = self._parse(table_context | contexts.TABLE_CELL_STYLE) + cell = self._pop() + reset_for_style = cell_context & contexts.TABLE_CELL_STYLE except BadRoute: self._head = reset raise @@ -1109,10 +1113,12 @@ class Tokenizer(object): self._head = reset + len(markup) try: self._push(table_context) - (style, padding) = 
self._parse_as_table_style("|") + padding = self._parse_as_table_style("|") + style = self._pop() # Don't parse the style separator self._head += 1 - cell_context, cell, unused = self._parse(table_context) + cell_context = self._parse(table_context) + cell = self._pop() except BadRoute: self._head = reset raise @@ -1130,9 +1136,11 @@ class Tokenizer(object): self._head -= 1 def _handle_table_cell_end(self, reset_for_style=False): - """Returns the context, stack, and whether to reset the cell for style - in a tuple.""" - return self._context, self._pop(), reset_for_style + """Returns the current context, with the TABLE_CELL_STYLE flag set if + it is necessary to reset and parse style attributes.""" + if reset_for_style: + return self._context | contexts.TABLE_CELL_STYLE + return self._context & ~contexts.TABLE_CELL_STYLE def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" @@ -1305,7 +1313,7 @@ class Tokenizer(object): if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE) - elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: + elif this == "|" and self._context & contexts.TABLE_CELL_STYLE: return self._handle_table_cell_end(reset_for_style=True) # on newline, clear out cell line contexts elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS: From 406dd3a157e72d3f37e80661cebc65cc544a321f Mon Sep 17 00:00:00 2001 From: David Winegar Date: Thu, 17 Jul 2014 16:07:43 -0700 Subject: [PATCH 055/102] All tokenizer end methods return a stack For C compatability, switch table cell end to return the stack. Now context is kept by using `keep_context` when calling `self._pop()`. --- mwparserfromhell/parser/contexts.py | 4 ++-- mwparserfromhell/parser/tokenizer.py | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 3827708..6dd5319 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -94,7 +94,7 @@ Local (stack-specific) contexts: * :const:`TABLE_OPEN` * :const:`TABLE_CELL_OPEN` - * :const:`TABLE_CELL_STYLE_POSSIBLE` + * :const:`TABLE_CELL_STYLE` * :const:`TABLE_TD_LINE` * :const:`TABLE_TH_LINE` * :const:`TABLE_CELL_LINE_CONTEXTS` @@ -180,7 +180,7 @@ GL_HEADING = 1 << 0 # Aggregate contexts: FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + - STYLE + TABLE_OPEN) + STYLE + TABLE) UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 7fda2d5..9e22b28 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1098,13 +1098,14 @@ class Tokenizer(object): self._head += len(markup) - 1 return - table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context + old_context = self._context reset = self._head self._head += len(markup) reset_for_style, padding = False, "" try: - cell_context = self._parse(table_context | contexts.TABLE_CELL_STYLE) - cell = self._pop() + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context | contexts.TABLE_CELL_STYLE) + cell_context = self._context + self._context = old_context reset_for_style = cell_context & contexts.TABLE_CELL_STYLE except BadRoute: self._head = reset @@ 
-1112,13 +1113,14 @@ class Tokenizer(object): if reset_for_style: self._head = reset + len(markup) try: - self._push(table_context) + self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) padding = self._parse_as_table_style("|") style = self._pop() # Don't parse the style separator self._head += 1 - cell_context = self._parse(table_context) - cell = self._pop() + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) + cell_context = self._context + self._context = old_context except BadRoute: self._head = reset raise @@ -1139,8 +1141,10 @@ class Tokenizer(object): """Returns the current context, with the TABLE_CELL_STYLE flag set if it is necessary to reset and parse style attributes.""" if reset_for_style: - return self._context | contexts.TABLE_CELL_STYLE - return self._context & ~contexts.TABLE_CELL_STYLE + self._context |= contexts.TABLE_CELL_STYLE + else: + self._context &= ~contexts.TABLE_CELL_STYLE + return self._pop(keep_context=True) def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" From 2d945b30e53d41b0a4d448ddee56d1580274b7c6 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Thu, 17 Jul 2014 16:21:20 -0700 Subject: [PATCH 056/102] Use uint64_t for context For the C tokenizer, include `` and use `uint64_t` instead of `int` for context. Changes to tables mean that context can be larger than 32 bits, and it is possible for `int` to only have 16 bits anyways (though this is very unlikely). --- mwparserfromhell/parser/tokenizer.c | 29 +++++++++++++++-------------- mwparserfromhell/parser/tokenizer.h | 7 ++++--- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 814ad50..90f51b0 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -241,7 +241,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) /* Add a new token stack, context, and textbuffer to the list. 
*/ -static int Tokenizer_push(Tokenizer* self, int context) +static int Tokenizer_push(Tokenizer* self, uint64_t context) { Stack* top = malloc(sizeof(Stack)); @@ -333,7 +333,7 @@ static PyObject* Tokenizer_pop(Tokenizer* self) static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) { PyObject* stack; - int context; + uint64_t context; if (Tokenizer_push_textbuffer(self)) return NULL; @@ -351,7 +351,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) */ static void* Tokenizer_fail_route(Tokenizer* self) { - int context = self->topstack->context; + uint64_t context = self->topstack->context; PyObject* stack = Tokenizer_pop(self); Py_XDECREF(stack); @@ -1034,7 +1034,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next) { // Built from Tokenizer_parse()'s end sentinels: Py_UNICODE after = Tokenizer_READ(self, 2); - int ctx = self->topstack->context; + uint64_t ctx = self->topstack->context; return (!this || this == '\n' || this == '[' || this == ']' || this == '<' || this == '>' || (this == '\'' && next == '\'') || @@ -1629,9 +1629,9 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data) static int Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text) { - int ctx = data->context; - int end_of_value = (ctx & TAG_ATTR_VALUE && - !(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE))); + uint64_t ctx = data->context; + uint64_t end_of_value = (ctx & TAG_ATTR_VALUE && + !(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE))); if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) { if (Tokenizer_push_tag_buffer(self, data)) @@ -2153,7 +2153,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag, static int Tokenizer_parse_italics(Tokenizer* self) { Py_ssize_t reset = self->head; - int context; + uint64_t context; PyObject *stack; stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1); @@ -2273,7 +2273,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self) */ static PyObject* Tokenizer_parse_style(Tokenizer* self) { - int context = self->topstack->context, ticks = 2, i; + uint64_t context = self->topstack->context, ticks = 2, i; self->head += 2; while (Tokenizer_READ(self, 0) == '\'') { @@ -2428,7 +2428,7 @@ static int Tokenizer_handle_dl_term(Tokenizer* self) /* Handle the end of the stream of wikitext. */ -static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) +static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) { PyObject *token, *text, *trash; int single; @@ -2457,7 +2457,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) Make sure we are not trying to write an invalid character. Return 0 if everything is safe, or -1 if the route must be failed. */ -static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) +static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) { if (context & LC_FAIL_NEXT) return -1; @@ -2536,9 +2536,9 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) Parse the wikicode string, using context for when to stop. If push is true, we will push a new context, otherwise we won't and context will be ignored. 
*/ -static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) +static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) { - int this_context; + uint64_t this_context; Py_UNICODE this, next, next_next, last; PyObject* temp; @@ -2697,7 +2697,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) { PyObject *text, *temp, *tokens; - int context = 0, skip_style_tags = 0; + uint64_t context = 0; + int skip_style_tags = 0; if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { Py_XDECREF(self->text); diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index dde6464..e9b1a92 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -29,6 +29,7 @@ SOFTWARE. #include #include #include +#include <stdint.h> #if PY_MAJOR_VERSION >= 3 #define IS_PY3K @@ -191,7 +192,7 @@ struct Textbuffer { struct Stack { PyObject* stack; - int context; + uint64_t context; struct Textbuffer* textbuffer; struct Stack* next; }; @@ -202,7 +203,7 @@ typedef struct { } HeadingData; typedef struct { - int context; + uint64_t context; struct Textbuffer* pad_first; struct Textbuffer* pad_before_eq; struct Textbuffer* pad_after_eq; @@ -267,7 +268,7 @@ static int Tokenizer_parse_entity(Tokenizer*); static int Tokenizer_parse_comment(Tokenizer*); static int Tokenizer_handle_dl_term(Tokenizer*); static int Tokenizer_parse_tag(Tokenizer*); -static PyObject* Tokenizer_parse(Tokenizer*, int, int); +static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int); static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); static int load_exceptions(void); From 0128b1f78a346dbe774800bd17b1b0f92bb9ca30 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Fri, 18 Jul 2014 17:41:24 -0700 Subject: [PATCH 057/102] Implement CTokenizer for tables Table support in the CTokenizer is implemented entirely in this commit - it didn't make much sense to me to split it up. All tests pass, and the memory test shows no leaks on Linux. --- mwparserfromhell/parser/tokenizer.c | 503 ++++++++++++++++++++++++++++++++++- mwparserfromhell/parser/tokenizer.h | 108 ++++---- mwparserfromhell/parser/tokenizer.py | 2 +- 3 files changed, 551 insertions(+), 62 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 90f51b0..1d2964e 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2454,6 +2454,399 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) } /* + Parse until ``end_token`` as style attributes for a table.
+*/ +static PyObject* Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, + int break_on_table_end) +{ + TagData *data = TagData_new(); + PyObject *padding, *trash; + Py_UNICODE this, next; + int can_exit, table_end; + + if (!data) + return NULL; + data->context = TAG_ATTR_READY; + + while (1) { + this = Tokenizer_READ(self, 0); + next = Tokenizer_READ(self, 1); + can_exit = (!(data->context & TAG_QUOTED) || data->context & TAG_NOTE_SPACE); + table_end = (break_on_table_end && this == '|' && next == '}'); + if ((this == end_token && can_exit) || table_end) { + if (data->context & (TAG_ATTR_NAME | TAG_ATTR_VALUE)) { + if (Tokenizer_push_tag_buffer(self, data)) { + TagData_dealloc(data); + return NULL; + } + } + if (Py_UNICODE_ISSPACE(this)) + Textbuffer_write(&(data->pad_first), this); + padding = Textbuffer_render(data->pad_first); + TagData_dealloc(data); + if (!padding) + return NULL; + return padding; + } + else if (!this || table_end || this == end_token) { + if (self->topstack->context & LC_TAG_ATTR) { + if (data->context & TAG_QUOTED) { + // Unclosed attribute quote: reset, don't die + data->context = TAG_ATTR_VALUE; + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + self->head = data->reset; + continue; + } + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + } + TagData_dealloc(data); + return Tokenizer_fail_route(self); + } + else { + if (Tokenizer_handle_tag_data(self, data, this) || BAD_ROUTE) { + TagData_dealloc(data); + return NULL; + } + } + self->head++; + } +} + +/* + Handle the start of a table. +*/ +static int Tokenizer_handle_table_start(Tokenizer* self) +{ + self->head += 2; + Py_ssize_t reset = self->head; + PyObject *style, *open_open_kwargs, *close_open_kwargs, *open_close_kwargs, + *padding, *newline_character, *open_wiki_markup, *close_wiki_markup; + PyObject *table = NULL; + + if(Tokenizer_push(self, LC_TABLE_OPEN)) + return -1; + padding = Tokenizer_parse_as_table_style(self, '\n', 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset - 1; + if (Tokenizer_emit_text(self, "{|")) + return -1; + return 0; + } + if (!padding) + return -1; + style = Tokenizer_pop(self); + if (!style) { + Py_DECREF(padding); + return -1; + } + + newline_character = PyUnicode_FromString("\n"); + if (!newline_character) { + Py_DECREF(padding); + Py_DECREF(style); + return -1; + } + // continue to parse if it is NOT an inline table + if (PyUnicode_Contains(padding, newline_character)) { + Py_DECREF(newline_character); + self->head++; + table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + // offset displacement done by parse() + self->head = reset - 1; + if (Tokenizer_emit_text(self, "{|")) + return -1; + return 0; + } + if (!table) { + Py_DECREF(padding); + Py_DECREF(style); + return -1; + } + } else { + Py_DECREF(newline_character); + // close tag + self->head += 2; + } + + open_open_kwargs = PyDict_New(); + if (!open_open_kwargs) + goto fail_decref_all; + open_wiki_markup = PyUnicode_FromString("{|"); + if (!open_wiki_markup) { + Py_DECREF(open_open_kwargs); + goto fail_decref_all; + } + PyDict_SetItemString(open_open_kwargs, "wiki_markup", open_wiki_markup); + Py_DECREF(open_wiki_markup); + if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs)) + goto fail_decref_all; + if (Tokenizer_emit_text(self, "table")) + goto fail_decref_all; + + if (style) { + if (Tokenizer_emit_all(self, style)) + goto fail_decref_padding_table; + Py_DECREF(style); + } + + close_open_kwargs = PyDict_New(); + if (!close_open_kwargs) + goto 
fail_decref_padding_table; + PyDict_SetItemString(close_open_kwargs, "padding", padding); + Py_DECREF(padding); + if (Tokenizer_emit_kwargs(self, TagCloseOpen, close_open_kwargs)) + goto fail_decref_table; + + if (table) { + if (Tokenizer_emit_all(self, table)) + goto fail_decref_table; + Py_DECREF(table); + } + + open_close_kwargs = PyDict_New(); + if (!open_close_kwargs) + return -1; + close_wiki_markup = PyUnicode_FromString("|}"); + if (!close_wiki_markup) { + Py_DECREF(open_close_kwargs); + return -1; + } + PyDict_SetItemString(open_close_kwargs, "wiki_markup", close_wiki_markup); + Py_DECREF(close_wiki_markup); + if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs)) + return -1; + if (Tokenizer_emit_text(self, "table")) + return -1; + if (Tokenizer_emit(self, TagCloseClose)) + return -1; + // offset displacement done by _parse() + self->head--; + return 0; + + fail_decref_all: + Py_DECREF(style); + fail_decref_padding_table: + Py_DECREF(padding); + fail_decref_table: + Py_XDECREF(table); + return -1; +} + +/* + Return the stack in order to handle the table end. +*/ +static PyObject * Tokenizer_handle_table_end(Tokenizer* self) +{ + self->head += 2; + return Tokenizer_pop(self); +} + +/* + Parse as style until end of the line, then continue. +*/ +static int Tokenizer_handle_table_row(Tokenizer* self) +{ + Py_ssize_t reset = self->head; + self->head += 2; + PyObject *padding, *open_kwargs, *close_kwargs, *wiki_markup; + PyObject *style = NULL; + + // If we can't recurse, still tokenize tag but parse style attrs as text + if (Tokenizer_CAN_RECURSE(self)) { + if(Tokenizer_push(self, LC_TABLE_OPEN)) + return -1; + padding = Tokenizer_parse_as_table_style(self, '\n', 0); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!padding) + return -1; + style = Tokenizer_pop(self); + if (!style) { + Py_DECREF(padding); + return -1; + } + } else { + padding = PyUnicode_FromString(""); + if (!padding) + return -1; + } + + open_kwargs = PyDict_New(); + if (!open_kwargs) + goto fail_decref_all; + wiki_markup = PyUnicode_FromString("|-"); + if (!wiki_markup) { + Py_DECREF(open_kwargs); + goto fail_decref_all; + } + PyDict_SetItemString(open_kwargs, "wiki_markup", wiki_markup); + Py_DECREF(wiki_markup); + if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_kwargs)) + goto fail_decref_all; + if (Tokenizer_emit_text(self, "tr")) + goto fail_decref_all; + + if (style) { + if (Tokenizer_emit_all(self, style)) + goto fail_decref_all; + Py_DECREF(style); + } + + close_kwargs = PyDict_New(); + if (!close_kwargs) + goto fail_decref_all; + PyDict_SetItemString(close_kwargs, "padding", padding); + Py_DECREF(padding); + if (Tokenizer_emit_kwargs(self, TagCloseSelfclose, close_kwargs)) + return -1; + return 0; + + fail_decref_all: + Py_XDECREF(style); + Py_DECREF(padding); + return -1; +} + +/* + Parse as normal syntax unless we hit a style marker, then parse style + as HTML attributes and the remainder as normal syntax. 
+*/ +static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, + const char *tag, uint64_t line_context) +{ + if (!Tokenizer_CAN_RECURSE(self)) { + if (Tokenizer_emit_text(self, markup)) + return -1; + self->head += strlen(markup) - 1; + return 0; + } + + uint64_t old_context = self->topstack->context; + uint64_t cell_context; + Py_ssize_t reset = self->head; + self->head += strlen(markup); + PyObject *padding; + PyObject *cell, *open_kwargs, *close_kwargs, *open_wiki_markup, *close_wiki_markup; + PyObject *style = NULL; + + cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!cell) + return -1; + cell_context = self->topstack->context; + self->topstack->context = old_context; + + if (cell_context & LC_TABLE_CELL_STYLE) { + Py_DECREF(cell); + self->head = reset + strlen(markup); + if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context)) + return -1; + padding = Tokenizer_parse_as_table_style(self, '|', 0); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!padding) + return -1; + style = Tokenizer_pop(self); + if (!style) { + Py_DECREF(padding); + return -1; + } + // Don't parse the style separator + self->head++; + cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context, 1); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!cell) + return -1; + cell_context = self->topstack->context; + self->topstack->context = old_context; + } + else { + padding = PyUnicode_FromString(""); + if (!padding) { + Py_DECREF(cell); + return -1; + } + } + + open_kwargs = PyDict_New(); + if (!open_kwargs) + goto fail_decref_all; + close_kwargs = PyDict_New(); + if (!close_kwargs) + goto fail_decref_all; + open_wiki_markup = PyUnicode_FromString(markup); + if (!open_wiki_markup) + goto fail_decref_all; + PyDict_SetItemString(open_kwargs, "wiki_markup", open_wiki_markup); + Py_DECREF(open_wiki_markup); + if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_kwargs)) + goto fail_decref_all; + if (Tokenizer_emit_text(self, tag)) + goto fail_decref_all; + + if (style) { + if (Tokenizer_emit_all(self, style)) + goto fail_decref_all; + close_wiki_markup = PyUnicode_FromString("|"); + if (!close_wiki_markup) + goto fail_decref_all; + PyDict_SetItemString(close_kwargs, "wiki_markup", close_wiki_markup); + Py_DECREF(close_wiki_markup); + Py_DECREF(style); + } + + PyDict_SetItemString(close_kwargs, "padding", padding); + Py_DECREF(padding); + if (Tokenizer_emit_kwargs(self, TagCloseSelfclose, close_kwargs)) + goto fail_decref_cell; + if (Tokenizer_emit_all(self, cell)) + goto fail_decref_cell; + Py_DECREF(cell); + // keep header/cell line contexts + self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE); + // offset displacement done by parse() + self->head--; + return 0; + + fail_decref_all: + Py_XDECREF(style); + Py_DECREF(padding); + Py_XDECREF(open_kwargs); + Py_XDECREF(close_kwargs); + fail_decref_cell: + Py_DECREF(cell); + return -1; +} + +/* + Returns the context, stack, and whether to reset the cell for style + in a tuple. +*/ +static PyObject* Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) +{ + if (reset_for_style) + self->topstack->context |= LC_TABLE_CELL_STYLE; + else + self->topstack->context &= ~LC_TABLE_CELL_STYLE; + return Tokenizer_pop_keeping_context(self); +} + +/* Make sure we are not trying to write an invalid character. 
Return 0 if everything is safe, or -1 if the route must be failed. */ @@ -2533,6 +2926,24 @@ static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE d } /* + Returns whether the current head has leading whitespace. + TODO: treat comments and templates as whitespace, allow fail on non-newline spaces. +*/ +static int Tokenizer_has_leading_whitespace(Tokenizer* self) +{ + int offset = 1; + Py_UNICODE current_character; + while (1) { + current_character = Tokenizer_READ_BACKWARDS(self, offset); + if (!current_character || current_character == '\n') + return 1; + else if (!Py_UNICODE_ISSPACE(current_character)) + return 0; + offset++; + } +} + +/* Parse the wikicode string, using context for when to stop. If push is true, we will push a new context, otherwise we won't and context will be ignored. */ @@ -2667,24 +3078,94 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) if (temp != Py_None) return temp; } - else if (!last || last == '\n') { - if (this == '#' || this == '*' || this == ';' || this == ':') { - if (Tokenizer_handle_list(self)) + else if ((!last || last == '\n') && (this == '#' || this == '*' || this == ';' || this == ':')) { + if (Tokenizer_handle_list(self)) + return NULL; + } + else if ((!last || last == '\n') && (this == '-' && this == next && + this == Tokenizer_READ(self, 2) && + this == Tokenizer_READ(self, 3))) { + if (Tokenizer_handle_hr(self)) + return NULL; + } + else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) { + if (Tokenizer_handle_dl_term(self)) + return NULL; + // kill potential table contexts + if (this == '\n') + self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; + } + + // Start of table parsing + else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) { + if (Tokenizer_CAN_RECURSE(self)) { + if (Tokenizer_handle_table_start(self)) + return NULL; + } + else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next)) + return NULL; + else + self->head++; + } + else if (this_context & LC_TABLE_OPEN) { + if (this == '|' && next == '|' && this_context & LC_TABLE_TD_LINE) { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_cell(self, "||", "td", LC_TABLE_TD_LINE)) + return NULL; + } + else if (this == '|' && next == '|' && this_context & LC_TABLE_TH_LINE) { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_cell(self, "||", "th", LC_TABLE_TH_LINE)) return NULL; } - else if (this == '-' && this == next && - this == Tokenizer_READ(self, 2) && - this == Tokenizer_READ(self, 3)) { - if (Tokenizer_handle_hr(self)) + else if (this == '!' && next == '!' 
&& this_context & LC_TABLE_TH_LINE) { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_cell(self, "!!", "th", LC_TABLE_TH_LINE)) + return NULL; + } + else if (this == '|' && this_context & LC_TABLE_CELL_STYLE) { + return Tokenizer_handle_table_cell_end(self, 1); + } + // on newline, clear out cell line contexts + else if (this == '\n' && this_context & LC_TABLE_CELL_LINE_CONTEXTS) { + self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; + if (Tokenizer_emit_char(self, this)) + return NULL; + } + else if (Tokenizer_has_leading_whitespace(self)) { + if (this == '|' && next == '}') { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else + return Tokenizer_handle_table_end(self); + } + else if (this == '|' && next == '-') { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_row(self)) + return NULL; + } + else if (this == '|') { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_cell(self, "|", "td", LC_TABLE_TD_LINE)) + return NULL; + } + else if (this == '!') { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_cell(self, "!", "th", LC_TABLE_TH_LINE)) + return NULL; + } + else if (Tokenizer_emit_char(self, this)) return NULL; } else if (Tokenizer_emit_char(self, this)) return NULL; } - else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) { - if (Tokenizer_handle_dl_term(self)) - return NULL; - } else if (Tokenizer_emit_char(self, this)) return NULL; self->head++; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index e9b1a92..de7b7d4 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -44,9 +44,9 @@ SOFTWARE. 
static const char MARKERS[] = { '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', - '-', '\n', '\0'}; + '-', '!', '\n', '\0'}; -#define NUM_MARKERS 18 +#define NUM_MARKERS 19 #define TEXTBUFFER_BLOCKSIZE 1024 #define MAX_DEPTH 40 #define MAX_CYCLES 100000 @@ -110,60 +110,68 @@ static PyObject* TagCloseClose; /* Local contexts: */ -#define LC_TEMPLATE 0x00000007 -#define LC_TEMPLATE_NAME 0x00000001 -#define LC_TEMPLATE_PARAM_KEY 0x00000002 -#define LC_TEMPLATE_PARAM_VALUE 0x00000004 - -#define LC_ARGUMENT 0x00000018 -#define LC_ARGUMENT_NAME 0x00000008 -#define LC_ARGUMENT_DEFAULT 0x00000010 - -#define LC_WIKILINK 0x00000060 -#define LC_WIKILINK_TITLE 0x00000020 -#define LC_WIKILINK_TEXT 0x00000040 - -#define LC_EXT_LINK 0x00000180 -#define LC_EXT_LINK_URI 0x00000080 -#define LC_EXT_LINK_TITLE 0x00000100 - -#define LC_HEADING 0x00007E00 -#define LC_HEADING_LEVEL_1 0x00000200 -#define LC_HEADING_LEVEL_2 0x00000400 -#define LC_HEADING_LEVEL_3 0x00000800 -#define LC_HEADING_LEVEL_4 0x00001000 -#define LC_HEADING_LEVEL_5 0x00002000 -#define LC_HEADING_LEVEL_6 0x00004000 - -#define LC_TAG 0x00078000 -#define LC_TAG_OPEN 0x00008000 -#define LC_TAG_ATTR 0x00010000 -#define LC_TAG_BODY 0x00020000 -#define LC_TAG_CLOSE 0x00040000 - -#define LC_STYLE 0x00780000 -#define LC_STYLE_ITALICS 0x00080000 -#define LC_STYLE_BOLD 0x00100000 -#define LC_STYLE_PASS_AGAIN 0x00200000 -#define LC_STYLE_SECOND_PASS 0x00400000 - -#define LC_DLTERM 0x00800000 - -#define LC_SAFETY_CHECK 0x3F000000 -#define LC_HAS_TEXT 0x01000000 -#define LC_FAIL_ON_TEXT 0x02000000 -#define LC_FAIL_NEXT 0x04000000 -#define LC_FAIL_ON_LBRACE 0x08000000 -#define LC_FAIL_ON_RBRACE 0x10000000 -#define LC_FAIL_ON_EQUALS 0x20000000 - +#define LC_TEMPLATE 0x0000000000000007 +#define LC_TEMPLATE_NAME 0x0000000000000001 +#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002 +#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004 + +#define LC_ARGUMENT 0x0000000000000018 +#define LC_ARGUMENT_NAME 0x0000000000000008 +#define LC_ARGUMENT_DEFAULT 0x0000000000000010 + +#define LC_WIKILINK 0x0000000000000060 +#define LC_WIKILINK_TITLE 0x0000000000000020 +#define LC_WIKILINK_TEXT 0x0000000000000040 + +#define LC_EXT_LINK 0x0000000000000180 +#define LC_EXT_LINK_URI 0x0000000000000080 +#define LC_EXT_LINK_TITLE 0x0000000000000100 + +#define LC_HEADING 0x0000000000007E00 +#define LC_HEADING_LEVEL_1 0x0000000000000200 +#define LC_HEADING_LEVEL_2 0x0000000000000400 +#define LC_HEADING_LEVEL_3 0x0000000000000800 +#define LC_HEADING_LEVEL_4 0x0000000000001000 +#define LC_HEADING_LEVEL_5 0x0000000000002000 +#define LC_HEADING_LEVEL_6 0x0000000000004000 + +#define LC_TAG 0x0000000000078000 +#define LC_TAG_OPEN 0x0000000000008000 +#define LC_TAG_ATTR 0x0000000000010000 +#define LC_TAG_BODY 0x0000000000020000 +#define LC_TAG_CLOSE 0x0000000000040000 + +#define LC_STYLE 0x0000000000780000 +#define LC_STYLE_ITALICS 0x0000000000080000 +#define LC_STYLE_BOLD 0x0000000000100000 +#define LC_STYLE_PASS_AGAIN 0x0000000000200000 +#define LC_STYLE_SECOND_PASS 0x0000000000400000 + +#define LC_DLTERM 0x0000000000800000 + +#define LC_SAFETY_CHECK 0x000000003F000000 +#define LC_HAS_TEXT 0x0000000001000000 +#define LC_FAIL_ON_TEXT 0x0000000002000000 +#define LC_FAIL_NEXT 0x0000000004000000 +#define LC_FAIL_ON_LBRACE 0x0000000008000000 +#define LC_FAIL_ON_RBRACE 0x0000000010000000 +#define LC_FAIL_ON_EQUALS 0x0000000020000000 + +// TODO realign all +#define LC_TABLE 0x00000007C0000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000700000000 +#define LC_TABLE_OPEN 
0x0000000040000000 +#define LC_TABLE_CELL_OPEN 0x0000000080000000 +#define LC_TABLE_CELL_STYLE 0x0000000100000000 +#define LC_TABLE_TD_LINE 0x0000000200000000 +#define LC_TABLE_TH_LINE 0x0000000400000000 /* Global contexts: */ #define GL_HEADING 0x1 /* Aggregate contexts: */ -#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9e22b28..e8f21c0 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1134,7 +1134,7 @@ class Tokenizer(object): self._emit_all(cell) # keep header/cell line contexts self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE) - # offset displacement done by _parse() + # offset displacement done by parse() self._head -= 1 def _handle_table_cell_end(self, reset_for_style=False): From 94a9e32494fd8c3f1ce5e39a5ef1738967244ac2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 21 Jul 2014 15:51:59 -0400 Subject: [PATCH 058/102] Add missing comma to test output. --- tests/tokenizer/tables.mwtest | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 163579b..9572733 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -302,7 +302,7 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_fir name: table_cell_unclosed_style label: Parse unclosed and closed bold and italics inside cells. 
input: "{|\n | ''foo || '''bar ||''baz''||'''test'''\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" ''foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" '''bar "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text="baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose() Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" ''foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" '''bar "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text="baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -317,4 +317,4 @@ output: [Text(text="{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{| name: recursion_one_hundred_opens label: test potentially dangerous recursion: one hundred table openings, with spaces input: "{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|" -output: [Text(text="{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|")] \ No newline at end of file +output: [Text(text="{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|")] From 7bbeb6899a653cbca35c75f66edddfc6289b7564 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 10:41:34 -0700 Subject: [PATCH 059/102] Fix ordering of tag representation Self-closing wiki syntax tags have incorrectly ordered wiki syntax and padding, fixed the ordering. 
--- mwparserfromhell/nodes/tag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index b3ea85c..c5f9d84 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -65,7 +65,7 @@ class Tag(Node): close = self.closing_wiki_markup if self.closing_wiki_markup else "" padding = self.padding if self.padding else "" if self.self_closing: - return self.wiki_markup + attrs + close + padding + return self.wiki_markup + attrs + padding + close else: return self.wiki_markup + attrs + padding + str(self.contents) + close From 64869fe84be7a5aa5b1c14f5f12c06232402ab9c Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 12:23:44 -0700 Subject: [PATCH 060/102] Remove style test Remove style test to properly implement implicit style closes later. --- tests/tokenizer/tables.mwtest | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 9572733..c684451 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -299,14 +299,6 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_fir --- -name: table_cell_unclosed_style -label: Parse unclosed and closed bold and italics inside cells. -input: "{|\n | ''foo || '''bar ||''baz''||'''test'''\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" ''foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" '''bar "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text="baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] - - ---- - name: recursion_five_hundred_opens label: test potentially dangerous recursion: five hundred table openings, without spaces input: "{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|" From 213c105666a669349dfa607a163da245df9af466 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 14:31:37 -0700 Subject: [PATCH 061/102] Table tags are no longer self-closing Table tags no longer self-closing. Rows and cells now contain their contents. Also refactored out an `emit_table_tag` method. Note: this will require changes to the Tag node and possibly the builder, those changes will be in the next commit. 
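To illustrate the new token-stream shape, compare the tokens emitted for the bare row "{|\n |- \n|}" (taken from the table_row_simple test updated below; an abbreviated sketch, not the full output):

    # Before: the row tag was self-closing, and its contents trailed it.
    TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n")
    # After: the row tag wraps its contents and carries an explicit close.
    TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"),
    TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose()

Rows and cells thus nest their contents the same way other Tag nodes do, which is what lets the emit_table_tag helper handle tables, rows, and cells uniformly.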
--- mwparserfromhell/parser/contexts.py | 9 +- mwparserfromhell/parser/tokenizer.c | 289 +++++++++++++++++------------------ mwparserfromhell/parser/tokenizer.h | 11 +- mwparserfromhell/parser/tokenizer.py | 83 +++++----- tests/tokenizer/tables.mwtest | 44 +++--- 5 files changed, 218 insertions(+), 218 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 6dd5319..ef44ce2 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -167,11 +167,12 @@ SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + TABLE_OPEN = 1 << 30 TABLE_CELL_OPEN = 1 << 31 TABLE_CELL_STYLE = 1 << 32 -TABLE_TD_LINE = 1 << 33 -TABLE_TH_LINE = 1 << 34 +TABLE_ROW_OPEN = 1 << 33 +TABLE_TD_LINE = 1 << 34 +TABLE_TH_LINE = 1 << 35 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE -TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_TD_LINE + - TABLE_TH_LINE) +TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + + TABLE_ROW_OPEN + + TABLE_TD_LINE + TABLE_TH_LINE) # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 1d2964e..c062404 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2454,6 +2454,88 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) } /* + Emit a table tag. +*/ +static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, + const char* tag, PyObject* style, PyObject* padding, + const char* close_open_markup, PyObject* contents, + const char* open_close_markup) +{ + PyObject *open_open_kwargs, *open_open_markup_unicode, *close_open_kwargs, *close_open_markup_unicode, + *open_close_kwargs, *open_close_markup_unicode; + + open_open_kwargs = PyDict_New(); + if (!open_open_kwargs) + goto fail_decref_all; + open_open_markup_unicode = PyUnicode_FromString(open_open_markup); + if (!open_open_markup_unicode) { + Py_DECREF(open_open_kwargs); + goto fail_decref_all; + } + PyDict_SetItemString(open_open_kwargs, "wiki_markup", open_open_markup_unicode); + Py_DECREF(open_open_markup_unicode); + if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs)) + goto fail_decref_all; + if (Tokenizer_emit_text(self, tag)) + goto fail_decref_all; + + if (style) { + if (Tokenizer_emit_all(self, style)) + goto fail_decref_all; + Py_DECREF(style); + } + + close_open_kwargs = PyDict_New(); + if (!close_open_kwargs) + goto fail_decref_padding_contents; + if (close_open_markup && strlen(close_open_markup) != 0) { + close_open_markup_unicode = PyUnicode_FromString(close_open_markup); + if (!close_open_markup_unicode) { + Py_DECREF(close_open_kwargs); + goto fail_decref_padding_contents; + } + PyDict_SetItemString(close_open_kwargs, "wiki_markup", close_open_markup_unicode); + Py_DECREF(close_open_markup_unicode); + } + PyDict_SetItemString(close_open_kwargs, "padding", padding); + Py_DECREF(padding); + if (Tokenizer_emit_kwargs(self, TagCloseOpen, close_open_kwargs)) + goto fail_decref_contents; + + if (contents) { + if (Tokenizer_emit_all(self, contents)) + goto fail_decref_contents; + Py_DECREF(contents); + } + + open_close_kwargs = PyDict_New(); + if (!open_close_kwargs) + return -1; + open_close_markup_unicode = PyUnicode_FromString(open_close_markup); + if (!open_close_markup_unicode) { + Py_DECREF(open_close_kwargs); + return -1; + } + PyDict_SetItemString(open_close_kwargs, "wiki_markup", open_close_markup_unicode); + 
Py_DECREF(open_close_markup_unicode); + if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs)) + return -1; + if (Tokenizer_emit_text(self, tag)) + return -1; + if (Tokenizer_emit(self, TagCloseClose)) + return -1; + return 0; + + fail_decref_all: + Py_XDECREF(style); + fail_decref_padding_contents: + Py_DECREF(padding); + fail_decref_contents: + Py_DECREF(contents); + return -1; +} + +/* Parse until ``end_token`` as style attributes for a table. */ static PyObject* Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, @@ -2521,8 +2603,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self) { self->head += 2; Py_ssize_t reset = self->head; - PyObject *style, *open_open_kwargs, *close_open_kwargs, *open_close_kwargs, - *padding, *newline_character, *open_wiki_markup, *close_wiki_markup; + PyObject *style, *padding, *newline_character; PyObject *table = NULL; if(Tokenizer_push(self, LC_TABLE_OPEN)) @@ -2573,68 +2654,11 @@ static int Tokenizer_handle_table_start(Tokenizer* self) self->head += 2; } - open_open_kwargs = PyDict_New(); - if (!open_open_kwargs) - goto fail_decref_all; - open_wiki_markup = PyUnicode_FromString("{|"); - if (!open_wiki_markup) { - Py_DECREF(open_open_kwargs); - goto fail_decref_all; - } - PyDict_SetItemString(open_open_kwargs, "wiki_markup", open_wiki_markup); - Py_DECREF(open_wiki_markup); - if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs)) - goto fail_decref_all; - if (Tokenizer_emit_text(self, "table")) - goto fail_decref_all; - - if (style) { - if (Tokenizer_emit_all(self, style)) - goto fail_decref_padding_table; - Py_DECREF(style); - } - - close_open_kwargs = PyDict_New(); - if (!close_open_kwargs) - goto fail_decref_padding_table; - PyDict_SetItemString(close_open_kwargs, "padding", padding); - Py_DECREF(padding); - if (Tokenizer_emit_kwargs(self, TagCloseOpen, close_open_kwargs)) - goto fail_decref_table; - - if (table) { - if (Tokenizer_emit_all(self, table)) - goto fail_decref_table; - Py_DECREF(table); - } - - open_close_kwargs = PyDict_New(); - if (!open_close_kwargs) - return -1; - close_wiki_markup = PyUnicode_FromString("|}"); - if (!close_wiki_markup) { - Py_DECREF(open_close_kwargs); - return -1; - } - PyDict_SetItemString(open_close_kwargs, "wiki_markup", close_wiki_markup); - Py_DECREF(close_wiki_markup); - if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs)) - return -1; - if (Tokenizer_emit_text(self, "table")) - return -1; - if (Tokenizer_emit(self, TagCloseClose)) + if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL, table, "|}")) return -1; // offset displacement done by _parse() self->head--; return 0; - - fail_decref_all: - Py_DECREF(style); - fail_decref_padding_table: - Py_DECREF(padding); - fail_decref_table: - Py_XDECREF(table); - return -1; } /* @@ -2651,67 +2675,60 @@ static PyObject * Tokenizer_handle_table_end(Tokenizer* self) */ static int Tokenizer_handle_table_row(Tokenizer* self) { + if (!Tokenizer_CAN_RECURSE(self)) { + if (Tokenizer_emit_text(self, "|-")) + return -1; + self->head += 1; + return 0; + } + Py_ssize_t reset = self->head; self->head += 2; - PyObject *padding, *open_kwargs, *close_kwargs, *wiki_markup; - PyObject *style = NULL; + PyObject *padding, *style, *row; - // If we can't recurse, still tokenize tag but parse style attrs as text - if (Tokenizer_CAN_RECURSE(self)) { - if(Tokenizer_push(self, LC_TABLE_OPEN)) - return -1; - padding = Tokenizer_parse_as_table_style(self, '\n', 0); - if (BAD_ROUTE) { - self->head = reset; - return 0; - } - 
if (!padding) - return -1; - style = Tokenizer_pop(self); - if (!style) { - Py_DECREF(padding); - return -1; - } - } else { - padding = PyUnicode_FromString(""); - if (!padding) - return -1; + if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) + return -1; + padding = Tokenizer_parse_as_table_style(self, '\n', 0); + if (BAD_ROUTE) { + self->head = reset; + return 0; } - - open_kwargs = PyDict_New(); - if (!open_kwargs) - goto fail_decref_all; - wiki_markup = PyUnicode_FromString("|-"); - if (!wiki_markup) { - Py_DECREF(open_kwargs); - goto fail_decref_all; + if (!padding) + return -1; + style = Tokenizer_pop(self); + if (!style) { + Py_DECREF(padding); + return -1; } - PyDict_SetItemString(open_kwargs, "wiki_markup", wiki_markup); - Py_DECREF(wiki_markup); - if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_kwargs)) - goto fail_decref_all; - if (Tokenizer_emit_text(self, "tr")) - goto fail_decref_all; - - if (style) { - if (Tokenizer_emit_all(self, style)) - goto fail_decref_all; + // don't parse the style separator + self->head++; + row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1); + if (BAD_ROUTE) { + Py_DECREF(padding); Py_DECREF(style); + self->head = reset; + return 0; + } + if (!row) { + Py_DECREF(padding); + Py_DECREF(style); + Py_DECREF(row); + return -1; } - close_kwargs = PyDict_New(); - if (!close_kwargs) - goto fail_decref_all; - PyDict_SetItemString(close_kwargs, "padding", padding); - Py_DECREF(padding); - if (Tokenizer_emit_kwargs(self, TagCloseSelfclose, close_kwargs)) + if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, "")) return -1; + // offset displacement done by _parse() + self->head--; return 0; +} - fail_decref_all: - Py_XDECREF(style); - Py_DECREF(padding); - return -1; +/* + Return the stack in order to handle the table row end. 
+*/ +static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) +{ + return Tokenizer_pop(self); } /* @@ -2732,9 +2749,9 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, uint64_t cell_context; Py_ssize_t reset = self->head; self->head += strlen(markup); - PyObject *padding; - PyObject *cell, *open_kwargs, *close_kwargs, *open_wiki_markup, *close_wiki_markup; + PyObject *padding, *cell; PyObject *style = NULL; + const char *close_open_markup = NULL; cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); if (BAD_ROUTE) { @@ -2783,54 +2800,16 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, } } - open_kwargs = PyDict_New(); - if (!open_kwargs) - goto fail_decref_all; - close_kwargs = PyDict_New(); - if (!close_kwargs) - goto fail_decref_all; - open_wiki_markup = PyUnicode_FromString(markup); - if (!open_wiki_markup) - goto fail_decref_all; - PyDict_SetItemString(open_kwargs, "wiki_markup", open_wiki_markup); - Py_DECREF(open_wiki_markup); - if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_kwargs)) - goto fail_decref_all; - if (Tokenizer_emit_text(self, tag)) - goto fail_decref_all; - if (style) { - if (Tokenizer_emit_all(self, style)) - goto fail_decref_all; - close_wiki_markup = PyUnicode_FromString("|"); - if (!close_wiki_markup) - goto fail_decref_all; - PyDict_SetItemString(close_kwargs, "wiki_markup", close_wiki_markup); - Py_DECREF(close_wiki_markup); - Py_DECREF(style); + close_open_markup = "|"; } - - PyDict_SetItemString(close_kwargs, "padding", padding); - Py_DECREF(padding); - if (Tokenizer_emit_kwargs(self, TagCloseSelfclose, close_kwargs)) - goto fail_decref_cell; - if (Tokenizer_emit_all(self, cell)) - goto fail_decref_cell; - Py_DECREF(cell); + if (Tokenizer_emit_table_tag(self, markup, tag, style, padding, close_open_markup, cell, "")) + return -1; // keep header/cell line contexts self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE); // offset displacement done by parse() self->head--; return 0; - - fail_decref_all: - Py_XDECREF(style); - Py_DECREF(padding); - Py_XDECREF(open_kwargs); - Py_XDECREF(close_kwargs); - fail_decref_cell: - Py_DECREF(cell); - return -1; } /* @@ -3139,12 +3118,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) if (this == '|' && next == '}') { if (this_context & LC_TABLE_CELL_OPEN) return Tokenizer_handle_table_cell_end(self, 0); + if (this_context & LC_TABLE_ROW_OPEN) + return Tokenizer_handle_table_row_end(self); else return Tokenizer_handle_table_end(self); } else if (this == '|' && next == '-') { if (this_context & LC_TABLE_CELL_OPEN) return Tokenizer_handle_table_cell_end(self, 0); + if (this_context & LC_TABLE_ROW_OPEN) + return Tokenizer_handle_table_row_end(self); else if (Tokenizer_handle_table_row(self)) return NULL; } diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index de7b7d4..57a0121 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -157,14 +157,15 @@ static PyObject* TagCloseClose; #define LC_FAIL_ON_RBRACE 0x0000000010000000 #define LC_FAIL_ON_EQUALS 0x0000000020000000 -// TODO realign all -#define LC_TABLE 0x00000007C0000000 -#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000700000000 +#define LC_TABLE 0x0000000FC0000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 #define LC_TABLE_OPEN 0x0000000040000000 #define LC_TABLE_CELL_OPEN 0x0000000080000000 #define 
LC_TABLE_CELL_STYLE 0x0000000100000000 -#define LC_TABLE_TD_LINE 0x0000000200000000 -#define LC_TABLE_TH_LINE 0x0000000400000000 +#define LC_TABLE_ROW_OPEN 0x0000000200000000 +#define LC_TABLE_TD_LINE 0x0000000400000000 +#define LC_TABLE_TH_LINE 0x0000000800000000 + /* Global contexts: */ #define GL_HEADING 0x1 diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e8f21c0..6ae6050 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1002,6 +1002,23 @@ class Tokenizer(object): self._fail_route() return self._pop() + def _emit_table_tag(self, open_open_markup, tag, style, padding, + close_open_markup, contents, open_close_markup): + """Emit a table tag.""" + self._emit(tokens.TagOpenOpen(wiki_markup=open_open_markup)) + self._emit_text(tag) + if style: + self._emit_all(style) + if close_open_markup: + self._emit(tokens.TagCloseOpen(wiki_markup=close_open_markup, padding=padding)) + else: + self._emit(tokens.TagCloseOpen(padding=padding)) + if contents: + self._emit_all(contents) + self._emit(tokens.TagOpenClose(wiki_markup=open_close_markup)) + self._emit_text(tag) + self._emit(tokens.TagCloseClose()) + def _parse_as_table_style(self, end_token, break_on_table_end=False): """Parse until ``end_token`` as style attributes for a table.""" data = _TagOpenData() @@ -1052,17 +1069,7 @@ class Tokenizer(object): self._head = reset - 1 self._emit_text("{|") else: - self._emit(tokens.TagOpenOpen(wiki_markup="{|")) - self._emit_text("table") - if style: - self._emit_all(style) - self._emit(tokens.TagCloseOpen(padding=padding)) - if table: - self._emit_all(table) - self._emit(tokens.TagOpenClose(wiki_markup="|}")) - self._emit_text("table") - self._emit(tokens.TagCloseClose()) - # offset displacement done by _parse() + self._emit_table_tag("{|", "table", style, padding, None, table, "|}") self._head -= 1 def _handle_table_end(self): @@ -1072,23 +1079,31 @@ class Tokenizer(object): def _handle_table_row(self): """Parse as style until end of the line, then continue.""" + if not self._can_recurse(): + self._emit_text("|-") + self._head += 1 + return + reset = self._head self._head += 2 style, padding = None, "" - # If we can't recurse, still tokenize tag but parse style attrs as text - if self._can_recurse(): - try: - self._push(contexts.TABLE_OPEN) - padding = self._parse_as_table_style("\n") - style = self._pop() - except BadRoute: - self._head = reset - raise - self._emit(tokens.TagOpenOpen(wiki_markup="|-")) - self._emit_text("tr") - if style: - self._emit_all(style) - self._emit(tokens.TagCloseSelfclose(padding=padding)) + try: + self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) + padding = self._parse_as_table_style("\n") + style = self._pop() + # don't parse the style separator + self._head += 1 + row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) + except BadRoute: + self._head = reset + raise + self._emit_table_tag("|-", "tr", style, padding, None, row, "") + # offset displacement done by parse() + self._head -= 1 + + def _handle_table_row_end(self): + """Return the stack in order to handle the table row end.""" + return self._pop() def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse style @@ -1101,7 +1116,7 @@ class Tokenizer(object): old_context = self._context reset = self._head self._head += len(markup) - reset_for_style, padding = False, "" + reset_for_style, padding, style = False, "", None try: cell = 
self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context | contexts.TABLE_CELL_STYLE) cell_context = self._context @@ -1124,14 +1139,8 @@ except BadRoute: self._head = reset raise - self._emit(tokens.TagOpenOpen(wiki_markup=markup)) - self._emit_text(tag) - if reset_for_style: - self._emit_all(style) - self._emit(tokens.TagCloseSelfclose(wiki_markup="|", padding=padding)) - else: - self._emit(tokens.TagCloseSelfclose(padding=padding)) - self._emit_all(cell) + close_open_markup = "|" if reset_for_style else None + self._emit_table_tag(markup, tag, style, padding, close_open_markup, cell, "") # keep header/cell line contexts self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE) # offset displacement done by parse() @@ -1140,6 +1149,8 @@ class Tokenizer(object): def _handle_table_cell_end(self, reset_for_style=False): """Returns the current context, with the TABLE_CELL_STYLE flag set if it is necessary to reset and parse style attributes.""" + if self._context & (contexts.FAIL & ~contexts.TABLE): + raise BadRoute if reset_for_style: self._context |= contexts.TABLE_CELL_STYLE else: @@ -1328,10 +1339,14 @@ class Tokenizer(object): if this == "|" and next == "}": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() + if self._context & contexts.TABLE_ROW_OPEN: + return self._handle_table_row_end() return self._handle_table_end() elif this == "|" and next == "-": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() + if self._context & contexts.TABLE_ROW_OPEN: + return self._handle_table_row_end() self._handle_table_row() elif this == "|": if self._context & contexts.TABLE_CELL_OPEN: diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index c684451..455da67 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -106,42 +106,42 @@ output: [Text(text="foo \n foo \t {|\n|}")] name: table_row_simple label: Simple table row. input: "{|\n |- \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_multiple label: Simple table row.
input: "{|\n |- \n|- \n |-\n |}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding="\n"), Text(text=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_simple label: Simple table cell. input: "{|\n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_inline label: Multiple inline table cells. input: "{|\n | foo || bar || test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_fake_close label: Looks like a table close but is not. 
input: "{|\n | |} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text="} \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text="} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_more_fake_close label: Looks like a table close but is not. input: "{|\n || |} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" |} \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" |} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -155,28 +155,28 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: table_header_simple label: Simple header cell. input: "{|\n ! foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_inline label: Multiple inline header cells. input: "{|\n ! foo || bar !! test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! 
bar \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_text_outside_cell label: Parse text inside table but outside of a cell. input: "{|\n bar \n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -197,84 +197,84 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: template_inside_table_cell label: Template within table cell. input: "{|\n |{{foo\n|bar=baz}} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes label: Parse table cell style attributes. 
input: "{| \n | name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_empty_attributes label: Parse table cell with style markers but no attributes. input: "{| \n | | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_with_dash label: Parse a situation in which a cell line looks like a row line. input: "{|\n ||- \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_quote_with_pipe label: Pipe inside an attribute quote should still be used as a style separator. 
input: "{| \n | name="foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. input: "{| \n | name|="foo bar" | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_pipe_after_equals label: Pipe inside an attribute should still be used as a style separator after an equals. input: "{| \n | name=|"foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseOpen(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_templates label: Pipe inside attributes shouldn't be style separator. 
input: "{| \n | {{comment|template=baz}} | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: header_cell_attributes label: Parse header cell style attributes. input: "{| \n ! name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_cell_attributes label: Parse cell style attributes of inline cells. input: "{| \n ! 
name="foo bar" | test ||color="red"| markup!!foo | time \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes label: Parse table row style attributes. input: "{| \n |- name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes_crazy_whitespace label: Parse table row style attributes with different whitespace. 
input: "{| \t \n |- \t name="foo bar" \t \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding=" \t \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- From 1b3e3c365704bed8b0b9d8601c9ca5cbe8e7e0f6 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 15:17:51 -0700 Subject: [PATCH 062/102] Change wiki tags to use style separators For wiki syntax tables, add `wiki_style_separator` as an attribute for the Tag node. Also reorder `closing_wiki_markup` property and tests to match its place in the constructor. --- mwparserfromhell/nodes/tag.py | 78 +++++++++++++++++++++++--------------- mwparserfromhell/parser/builder.py | 6 ++- tests/test_tag.py | 40 ++++++++++++------- 3 files changed, 79 insertions(+), 45 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index c5f9d84..e9531e7 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -35,7 +35,8 @@ class Tag(Node): def __init__(self, tag, contents=None, attrs=None, wiki_markup=None, self_closing=False, invalid=False, implicit=False, padding="", - closing_tag=None, closing_wiki_markup=None): + closing_tag=None, wiki_style_separator=None, + closing_wiki_markup=None): super(Tag, self).__init__() self._tag = tag if contents is None and not self_closing: @@ -44,12 +45,6 @@ class Tag(Node): self._contents = contents self._attrs = attrs if attrs else [] self._wiki_markup = wiki_markup - if closing_wiki_markup: - self._closing_wiki_markup = closing_wiki_markup - elif wiki_markup and not self_closing: - self._closing_wiki_markup = wiki_markup - else: - self._closing_wiki_markup = None self._self_closing = self_closing self._invalid = invalid self._implicit = implicit @@ -58,16 +53,28 @@ class Tag(Node): self._closing_tag = closing_tag else: self._closing_tag = tag + self._wiki_style_separator = wiki_style_separator + if closing_wiki_markup is not None: + self._closing_wiki_markup = closing_wiki_markup + elif wiki_markup and not self_closing: + self._closing_wiki_markup = wiki_markup + else: + self._closing_wiki_markup = None def __unicode__(self): if self.wiki_markup: - attrs = "".join([str(attr) for attr in self.attributes]) if self.attributes else "" - close = self.closing_wiki_markup if self.closing_wiki_markup else "" - padding = self.padding if self.padding else "" + if self.attributes: + attrs = "".join([str(attr) for attr in self.attributes]) + else: + attrs = "" + padding = self.padding or "" + separator = self.wiki_style_separator or "" + close = self.closing_wiki_markup or "" if self.self_closing: - return self.wiki_markup + attrs + padding + close + return self.wiki_markup + attrs + padding + separator else: - return self.wiki_markup + attrs + padding + str(self.contents) + close + return 
self.wiki_markup + attrs + padding + separator + \ + str(self.contents) + close result = ("``).""" return self._self_closing @@ -197,6 +190,27 @@ class Tag(Node): """ return self._closing_tag + @property + def wiki_style_separator(self): + """The separator between the padding and content in a wiki markup tag. + + Essentially the wiki equivalent of the ``TagCloseOpen`` token. + """ + return self._wiki_style_separator + + @property + def closing_wiki_markup(self): + """The wikified version of the closing tag to show instead of HTML. + + If set to a value, this will be displayed instead of the close tag + brackets. If :attr:`self_closing` is ``True``, then this is not + displayed. If :attr:`wiki_markup` is set and this has not been set, this + is set to the value of :attr:`wiki_markup`. If this has been set and + :attr:`wiki_markup` is set to a ``False`` value, this is set to + ``None``. + """ + return self._closing_wiki_markup + @tag.setter def tag(self, value): self._tag = self._closing_tag = parse_anything(value) @@ -211,10 +225,6 @@ class Tag(Node): if not value or not self.closing_wiki_markup: self.closing_wiki_markup = str(value) if value else None - @closing_wiki_markup.setter - def closing_wiki_markup(self, value): - self._closing_wiki_markup = str(value) if value else None - @self_closing.setter def self_closing(self, value): self._self_closing = bool(value) @@ -241,6 +251,14 @@ class Tag(Node): def closing_tag(self, value): self._closing_tag = parse_anything(value) + @wiki_style_separator.setter + def wiki_style_separator(self, value): + self._wiki_style_separator = str(value) if value else None + + @closing_wiki_markup.setter + def closing_wiki_markup(self, value): + self._closing_wiki_markup = str(value) if value else None + def has(self, name): """Return whether any attribute in the tag has the given *name*.
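For context, a rough sketch (not part of this patch) of how the two new attributes surface once this branch is built. It uses only calls that already exist in the library (parse(), filter_tags(), Tag.contents) plus the properties added above; the expected values follow the tables.mwtest cases elsewhere in this series:

    import mwparserfromhell

    code = mwparserfromhell.parse('{|\n| name="foo"| test \n|}')
    # Recursive filter in document order: the table node first, then the cell.
    table, cell = code.filter_tags()
    assert cell.wiki_markup == "|"            # markup that opens the cell
    assert cell.wiki_style_separator == "|"   # separator after the attributes
    assert cell.closing_wiki_markup == ""     # wiki cells have no explicit close
    assert str(cell) == '| name="foo"| test \n'  # attributes round-trip

The separator is what allows a cell's style attributes to be rendered back out between the opening markup and the contents without losing the second pipe.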
diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 32cbb93..99a54d1 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -248,13 +248,14 @@ class Builder(object): close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose) implicit, attrs, contents, closing_tag = False, [], None, None wiki_markup, invalid = token.wiki_markup, token.invalid or False - closing_wiki_markup = None + wiki_style_separator, closing_wiki_markup = None, wiki_markup self._push() while self._tokens: token = self._tokens.pop() if isinstance(token, tokens.TagAttrStart): attrs.append(self._handle_attribute(token)) elif isinstance(token, tokens.TagCloseOpen): + wiki_style_separator = token.wiki_markup padding = token.padding or "" tag = self._pop() self._push() @@ -273,7 +274,8 @@ class Builder(object): self_closing = False closing_tag = self._pop() return Tag(tag, contents, attrs, wiki_markup, self_closing, - invalid, implicit, padding, closing_tag, closing_wiki_markup) + invalid, implicit, padding, closing_tag, + wiki_style_separator, closing_wiki_markup) else: self._write(self._handle_token(token)) raise ParserError("_handle_tag() missed a close token") diff --git a/tests/test_tag.py b/tests/test_tag.py index 2d67723..c2c751b 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -171,19 +171,6 @@ class TestTag(TreeEqualityTestCase): self.assertFalse(node.wiki_markup) self.assertEqual("italic text", node) - def test_closing_wiki_markup(self): - """test getter/setter behavior for closing_wiki_markup attribute""" - node = Tag(wraptext("table"), wraptext("\n")) - self.assertIs(None, node.closing_wiki_markup) - node.wiki_markup = "{|" - self.assertEqual("{|", node.closing_wiki_markup) - node.closing_wiki_markup = "|}" - self.assertEqual("|}", node.closing_wiki_markup) - self.assertEqual("{|\n|}", node) - node.wiki_markup = False - self.assertFalse(node.closing_wiki_markup) - self.assertEqual("<table>\n</table>", node)
- def test_self_closing(self): """test getter/setter for the self_closing attribute""" node = Tag(wraptext("ref"), wraptext("foobar")) @@ -239,6 +226,33 @@ class TestTag(TreeEqualityTestCase): self.assertWikicodeEqual(parsed, node.closing_tag) self.assertEqual("foobar", node) + def test_wiki_style_separator(self): + """test getter/setter for wiki_style_separator attribute""" + node = Tag(wraptext("table"), wraptext("\n")) + self.assertIs(None, node.wiki_style_separator) + node.wiki_style_separator = "|" + self.assertEqual("|", node.wiki_style_separator) + node.wiki_markup = "{" + self.assertEqual("{|\n{", node) + node2 = Tag(wraptext("table"), wraptext("\n"), wiki_style_separator="|") + self.assertEqual("|", node2.wiki_style_separator) + + def test_closing_wiki_markup(self): + """test getter/setter for closing_wiki_markup attribute""" + node = Tag(wraptext("table"), wraptext("\n")) + self.assertIs(None, node.closing_wiki_markup) + node.wiki_markup = "{|" + self.assertEqual("{|", node.closing_wiki_markup) + node.closing_wiki_markup = "|}" + self.assertEqual("|}", node.closing_wiki_markup) + self.assertEqual("{|\n|}", node) + node.wiki_markup = False + self.assertFalse(node.closing_wiki_markup) + self.assertEqual("<table>\n</table>", node)
+ node2 = Tag(wraptext("table"), wraptext("\n"), wiki_markup="{|", + closing_wiki_markup="|}") + self.assertEqual("|}", node2.closing_wiki_markup) + def test_has(self): """test Tag.has()""" node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")]) From c63108039b4bb56348bd54ba0b59fe77c5f19eec Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 16:01:32 -0700 Subject: [PATCH 063/102] Fix C code to make declarations before statements Python 3.4 compiles C extensions with the `-Werror=declaration-after-statement` flag that enforces C90 more strictly than previous versions. Move all statements after declarations to make sure this extension builds on 3.4. --- mwparserfromhell/parser/tokenizer.c | 34 +++++++++++++++++----------------- mwparserfromhell/parser/tokenizer.py | 26 ++++++++++++-------------- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index c062404..c902c3d 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2601,17 +2601,17 @@ static PyObject* Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, */ static int Tokenizer_handle_table_start(Tokenizer* self) { - self->head += 2; - Py_ssize_t reset = self->head; + Py_ssize_t reset = self->head + 1; PyObject *style, *padding, *newline_character; PyObject *table = NULL; + self->head += 2; if(Tokenizer_push(self, LC_TABLE_OPEN)) return -1; padding = Tokenizer_parse_as_table_style(self, '\n', 1); if (BAD_ROUTE) { RESET_ROUTE(); - self->head = reset - 1; + self->head = reset; if (Tokenizer_emit_text(self, "{|")) return -1; return 0; @@ -2638,7 +2638,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self) if (BAD_ROUTE) { RESET_ROUTE(); // offset displacement done by parse() - self->head = reset - 1; + self->head = reset; if (Tokenizer_emit_text(self, "{|")) return -1; return 0; @@ -2675,17 +2675,17 @@ static PyObject * Tokenizer_handle_table_end(Tokenizer* self) */ static int Tokenizer_handle_table_row(Tokenizer* self) { + Py_ssize_t reset = self->head; + PyObject *padding, *style, *row; + self->head += 2; + if (!Tokenizer_CAN_RECURSE(self)) { if (Tokenizer_emit_text(self, "|-")) return -1; - self->head += 1; + self->head -= 1; return 0; } - Py_ssize_t reset = self->head; - self->head += 2; - PyObject *padding, *style, *row; - if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) return -1; padding = Tokenizer_parse_as_table_style(self, '\n', 0); @@ -2738,20 +2738,20 @@ static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, const char *tag, uint64_t line_context) { - if (!Tokenizer_CAN_RECURSE(self)) { - if (Tokenizer_emit_text(self, markup)) - return -1; - self->head += strlen(markup) - 1; - return 0; - } - uint64_t old_context = self->topstack->context; uint64_t cell_context; Py_ssize_t reset = self->head; - self->head += strlen(markup); PyObject *padding, *cell; PyObject *style = NULL; const char *close_open_markup = NULL; + self->head += strlen(markup); + + if (!Tokenizer_CAN_RECURSE(self)) { + if (Tokenizer_emit_text(self, markup)) + return -1; + self->head--; + return 0; + } cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); if (BAD_ROUTE) { diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 6ae6050..59f2156 100644 --- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py @@ -1050,9 +1050,9 @@ class Tokenizer(object): def _handle_table_start(self): """Handle the start of a table.""" - self._head += 2 - reset = self._head + reset = self._head + 1 style, table = None, None + self._head += 2 try: self._push(contexts.TABLE_OPEN) padding = self._parse_as_table_style("\n", break_on_table_end=True) @@ -1066,7 +1066,7 @@ class Tokenizer(object): self._head += 2 except BadRoute: # offset displacement done by _parse() - self._head = reset - 1 + self._head = reset self._emit_text("{|") else: self._emit_table_tag("{|", "table", style, padding, None, table, "|}") @@ -1079,14 +1079,14 @@ class Tokenizer(object): def _handle_table_row(self): """Parse as style until end of the line, then continue.""" + reset = self._head + style, padding = None, "" + self._head += 2 if not self._can_recurse(): self._emit_text("|-") - self._head += 1 + self._head -= 1 return - reset = self._head - self._head += 2 - style, padding = None, "" try: self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) padding = self._parse_as_table_style("\n") @@ -1108,15 +1108,15 @@ class Tokenizer(object): def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse style as HTML attributes and the remainder as normal syntax.""" + old_context = self._context + reset = self._head + reset_for_style, padding, style = False, "", None + self._head += len(markup) if not self._can_recurse(): self._emit_text(markup) - self._head += len(markup) - 1 + self._head -= 1 return - old_context = self._context - reset = self._head - self._head += len(markup) - reset_for_style, padding, style = False, "", None try: cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context | contexts.TABLE_CELL_STYLE) cell_context = self._context @@ -1149,8 +1149,6 @@ class Tokenizer(object): def _handle_table_cell_end(self, reset_for_style=False): """Returns the current context, with the TABLE_CELL_STYLE flag set if it is necessary to reset and parse style attributes.""" - if self._context & (contexts.FAIL & ~contexts.TABLE): - raise BadRoute if reset_for_style: self._context |= contexts.TABLE_CELL_STYLE else: From 8dc70bc20b4f4f0926db267ed4430ff175bcb37b Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 16:31:56 -0700 Subject: [PATCH 064/102] Add test coverage Add some table tests to increase coverage. Also reorder some tests. --- tests/test_tag.py | 4 +++- tests/tokenizer/tables.mwtest | 51 ++++++++++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/tests/test_tag.py b/tests/test_tag.py index c2c751b..b33b0c2 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -249,9 +249,11 @@ class TestTag(TreeEqualityTestCase): node.wiki_markup = False self.assertFalse(node.closing_wiki_markup) self.assertEqual("\n
    ", node) - node2 = Tag(wraptext("table"), wraptext("\n"), wiki_markup="{|", + node2 = Tag(wraptext("table"), wraptext("\n"), + attrs=[agen("id", "foo")], wiki_markup="{|", closing_wiki_markup="|}") self.assertEqual("|}", node2.closing_wiki_markup) + self.assertEqual('{| id="foo"\n|}', node2) def test_has(self): """test Tag.has()""" diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 455da67..39acf0c 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -106,7 +106,7 @@ output: [Text(text="foo \n foo \t {|\n|}")] name: table_row_simple label: Simple table row. input: "{|\n |- \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Tag Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -131,6 +131,41 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- +name: table_cell_multiple +label: Multiple table cells (non-inline). +input: "{|\n| foo \n| bar \n| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_header_simple +label: Simple header cell. +input: "{|\n ! foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_header_inline +label: Multiple inline header cells. +input: "{|\n ! foo || bar !! test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_header_multiple +label: Multiple table header cells (non-inline). +input: "{|\n! foo \n! bar \n! 
test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: nested_cells_and_rows +label: Combination of cells and rows in a table. +input: "{|\n|- \n| foo \n|- \n| bar\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + name: table_cell_fake_close label: Looks like a table close but is not. input: "{|\n | |} \n|}" @@ -152,20 +187,6 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- -name: table_header_simple -label: Simple header cell. -input: "{|\n ! foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] - ---- - -name: table_header_inline -label: Multiple inline header cells. -input: "{|\n ! foo || bar !! test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] - ---- - name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! 
bar \n|}" From c802b1f8143018e8d014c682eb98c14d11b06c54 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Fri, 25 Jul 2014 15:53:35 -0700 Subject: [PATCH 065/102] Change context to uint64_t One-line fix --- mwparserfromhell/parser/tokenizer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 57a0121..8d2d428 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -53,7 +53,8 @@ static const char MARKERS[] = { #define MAX_BRACES 255 #define MAX_ENTITY_SIZE 8 -static int route_state = 0, route_context = 0; +static int route_state = 0; +static uint64_t route_context = 0; #define BAD_ROUTE route_state #define BAD_ROUTE_CONTEXT route_context #define FAIL_ROUTE(context) route_state = 1; route_context = context From 1a4c88e11f8b6403e4a15a1e24b67b3185c884c6 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Fri, 25 Jul 2014 15:54:37 -0700 Subject: [PATCH 066/102] Correctly handle no table endings Tests were not correctly testing the situations without a table close. Fixed tests and then fixed tokenizers for failing tests. Also refactored pytokenizer to more closely match the ctokenizer by only holding the `_parse` methods in the try blocks and no other code. --- mwparserfromhell/parser/tokenizer.c | 28 ++++++++++++--- mwparserfromhell/parser/tokenizer.py | 70 +++++++++++++++++++++++------------- tests/tokenizer/tables.mwtest | 49 +++++++++++++++++++++---- 3 files changed, 110 insertions(+), 37 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index c902c3d..bad72ef 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2636,8 +2636,9 @@ static int Tokenizer_handle_table_start(Tokenizer* self) self->head++; table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); if (BAD_ROUTE) { + Py_DECREF(padding); + Py_DECREF(style); RESET_ROUTE(); - // offset displacement done by parse() self->head = reset; if (Tokenizer_emit_text(self, "{|")) return -1; @@ -2676,7 +2677,7 @@ static PyObject * Tokenizer_handle_table_end(Tokenizer* self) static int Tokenizer_handle_table_row(Tokenizer* self) { Py_ssize_t reset = self->head; - PyObject *padding, *style, *row; + PyObject *padding, *style, *row, *trash; self->head += 2; if (!Tokenizer_CAN_RECURSE(self)) { @@ -2690,6 +2691,8 @@ static int Tokenizer_handle_table_row(Tokenizer* self) return -1; padding = Tokenizer_parse_as_table_style(self, '\n', 0); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } @@ -2704,6 +2707,8 @@ static int Tokenizer_handle_table_row(Tokenizer* self) self->head++; row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); Py_DECREF(padding); Py_DECREF(style); self->head = reset; @@ -2712,7 +2717,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self) if (!row) { Py_DECREF(padding); Py_DECREF(style); - Py_DECREF(row); return -1; } @@ -2741,7 +2745,7 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, uint64_t old_context = self->topstack->context; uint64_t cell_context; Py_ssize_t reset = self->head; - PyObject *padding, *cell; + PyObject *padding, *cell, *trash; PyObject *style = NULL; const char *close_open_markup = NULL; self->head += strlen(markup); @@ -2755,6 +2759,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, cell = 
Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } @@ -2770,6 +2776,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, return -1; padding = Tokenizer_parse_as_table_style(self, '|', 0); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } @@ -2784,11 +2792,18 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, self->head++; cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context, 1); if (BAD_ROUTE) { + Py_DECREF(padding); + Py_DECREF(style); + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } - if (!cell) + if (!cell) { + Py_DECREF(padding); + Py_DECREF(style); return -1; + } cell_context = self->topstack->context; self->topstack->context = old_context; } @@ -3148,6 +3163,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) } else if (Tokenizer_emit_char(self, this)) return NULL; + // Raise BadRoute to table start + if (BAD_ROUTE) + return NULL; } else if (Tokenizer_emit_char(self, this)) return NULL; diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 59f2156..527d364 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1053,24 +1053,30 @@ class Tokenizer(object): reset = self._head + 1 style, table = None, None self._head += 2 + + self._push(contexts.TABLE_OPEN) try: - self._push(contexts.TABLE_OPEN) padding = self._parse_as_table_style("\n", break_on_table_end=True) - style = self._pop() - # continue to parse if it is NOT an inline table - if "\n" in padding: - self._head += 1 - table = self._parse(contexts.TABLE_OPEN) - else: - # close tag - self._head += 2 except BadRoute: - # offset displacement done by _parse() self._head = reset self._emit_text("{|") + return + style = self._pop() + # continue to parse if it is NOT an inline table + if "\n" in padding: + self._head += 1 + try: + table = self._parse(contexts.TABLE_OPEN) + except BadRoute: + self._head = reset + self._emit_text("{|") + return else: - self._emit_table_tag("{|", "table", style, padding, None, table, "|}") - self._head -= 1 + # close tag + self._head += 2 + self._emit_table_tag("{|", "table", style, padding, None, table, "|}") + # offset displacement done by _parse() + self._head -= 1 def _handle_table_end(self): """Return the stack in order to handle the table end.""" @@ -1087,15 +1093,21 @@ class Tokenizer(object): self._head -= 1 return + self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) try: - self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) padding = self._parse_as_table_style("\n") - style = self._pop() - # don't parse the style separator - self._head += 1 + except BadRoute: + self._head = reset + self._pop() + raise + style = self._pop() + # don't parse the style separator + self._head += 1 + try: row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) except BadRoute: self._head = reset + self._pop() raise self._emit_table_tag("|-", "tr", style, padding, None, row, "") # offset displacement done by parse() @@ -1119,26 +1131,34 @@ class Tokenizer(object): try: cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context | contexts.TABLE_CELL_STYLE) - cell_context = self._context - self._context = old_context - reset_for_style = cell_context & 
contexts.TABLE_CELL_STYLE except BadRoute: self._head = reset + self._pop() raise + cell_context = self._context + self._context = old_context + reset_for_style = cell_context & contexts.TABLE_CELL_STYLE if reset_for_style: self._head = reset + len(markup) + self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) try: - self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) padding = self._parse_as_table_style("|") - style = self._pop() - # Don't parse the style separator - self._head += 1 + except BadRoute: + self._head = reset + self._pop() + raise + style = self._pop() + # Don't parse the style separator + self._head += 1 + try: cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) - cell_context = self._context - self._context = old_context except BadRoute: self._head = reset + ret = self._pop() raise + cell_context = self._context + self._context = old_context + close_open_markup = "|" if reset_for_style else None self._emit_table_tag(markup, tag, style, padding, close_open_markup, cell, "") # keep header/cell line contexts diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 39acf0c..ecace32 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -13,23 +13,51 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- name: no_table_close_simple -label: Handle case when there is no table close. +label: No table close on inline table input: "{| " output: [Text(text="{| ")] --- +name: no_table_close_newline +label: No table close with a newline +input: "{| \n " +output: [Text(text="{| \n ")] + +--- + name: no_table_close_inside_cell -label: Handle case when there is no table close while inside of a cell. -input: "{| | " -output: [Text(text="{| | ")] +label: No table close while inside of a cell +input: "{| \n| " +output: [Text(text="{| \n| ")] + +--- + +name: no_table_close_inside_cell_after_newline +label: No table close while inside of a cell after a newline +input: "{| \n| \n " +output: [Text(text="{| \n| \n ")] + +--- + +name: no_table_close_inside_cell_with_attributes +label: No table close while inside of a cell with attributes +input: "{| \n| red | test" +output: [Text(text="{| \n| red | test")] --- name: no_table_close_inside_row -label: Handle case when there is no table close while inside of a row. -input: "{| |- " -output: [Text(text="{| |- ")] +label: No table close while inside of a row +input: "{| \n|- " +output: [Text(text="{| \n|- ")] + +--- + +name: no_table_close_inside_row_after_newline +label: No table close while inside of a row after a newline +input: "{| \n|- \n " +output: [Text(text="{| \n|- \n ")] --- @@ -40,6 +68,13 @@ output: [Text(text="{| border=\"1\"")] --- +name: no_table_close_unclosed_attributes +label: Don't parse unclosed attributes if the table doesn't exist. +input: "{| border=" +output: [Text(text="{| border=")] + +--- + name: no_table_close_row_attributes label: Don't parse row attributes as attributes if the table doesn't exist. 
input: "{| |- border="1"" From cb875ae347f0b746d99159fa7edb235006145fbd Mon Sep 17 00:00:00 2001 From: Kunal Mehta Date: Sat, 2 Aug 2014 19:30:20 -0700 Subject: [PATCH 067/102] Force opening README.rst as utf-8 Causes issues if the locale is not set to utf-8 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d488650..68943ac 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ from setuptools import setup, find_packages, Extension from mwparserfromhell import __version__ from mwparserfromhell.compat import py26, py3k -with open("README.rst") as fp: +with open("README.rst", **{'encoding':'utf-8'} if py3k else {}) as fp: long_docs = fp.read() tokenizer = Extension("mwparserfromhell.parser._tokenizer", From 810c24e123c3adf67cd09f7bfe5a356305917612 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 Oct 2014 01:11:46 -0500 Subject: [PATCH 068/102] Don't check quotes when attributes are built from tokens. --- mwparserfromhell/nodes/extras/attribute.py | 4 ++-- mwparserfromhell/parser/builder.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index cb50194..7d296dc 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -37,9 +37,9 @@ class Attribute(StringMixIn): """ def __init__(self, name, value=None, quotes='"', pad_first=" ", - pad_before_eq="", pad_after_eq=""): + pad_before_eq="", pad_after_eq="", check_quotes=True): super(Attribute, self).__init__() - if not quotes and self._value_needs_quotes(value): + if check_quotes and not quotes and self._value_needs_quotes(value): raise ValueError("given value {0!r} requires quotes".format(value)) self._name = name self._value = value diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 2d68036..97123f7 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -237,7 +237,8 @@ class Builder(object): else: name, value = self._pop(), None return Attribute(name, value, quotes, start.pad_first, - start.pad_before_eq, start.pad_after_eq) + start.pad_before_eq, start.pad_after_eq, + check_quotes=False) else: self._write(self._handle_token(token)) raise ParserError("_handle_attribute() missed a close token") From e446c51347f061670e78d47840a34c1028317798 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 Oct 2014 01:51:44 -0500 Subject: [PATCH 069/102] Adjust table test labels for consistency. --- tests/tokenizer/tables.mwtest | 102 +++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index ecace32..b411045 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -1,355 +1,355 @@ name: empty_table -label: Parsing an empty table. +label: parsing an empty table input: "{|\n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_table -label: Correctly handle tables with close on the same line. 
+label: correctly handle tables with close on the same line input: "{||}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=""), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_close_simple -label: No table close on inline table +label: no table close on inline table input: "{| " output: [Text(text="{| ")] --- name: no_table_close_newline -label: No table close with a newline +label: no table close with a newline input: "{| \n " output: [Text(text="{| \n ")] --- name: no_table_close_inside_cell -label: No table close while inside of a cell +label: no table close while inside of a cell input: "{| \n| " output: [Text(text="{| \n| ")] --- name: no_table_close_inside_cell_after_newline -label: No table close while inside of a cell after a newline +label: no table close while inside of a cell after a newline input: "{| \n| \n " output: [Text(text="{| \n| \n ")] --- name: no_table_close_inside_cell_with_attributes -label: No table close while inside of a cell with attributes +label: no table close while inside of a cell with attributes input: "{| \n| red | test" output: [Text(text="{| \n| red | test")] --- name: no_table_close_inside_row -label: No table close while inside of a row +label: no table close while inside of a row input: "{| \n|- " output: [Text(text="{| \n|- ")] --- name: no_table_close_inside_row_after_newline -label: No table close while inside of a row after a newline +label: no table close while inside of a row after a newline input: "{| \n|- \n " output: [Text(text="{| \n|- \n ")] --- name: no_table_close_attributes -label: Don't parse attributes as attributes if the table doesn't exist. +label: don't parse attributes as attributes if the table doesn't exist input: "{| border="1"" output: [Text(text="{| border=\"1\"")] --- name: no_table_close_unclosed_attributes -label: Don't parse unclosed attributes if the table doesn't exist. +label: don't parse unclosed attributes if the table doesn't exist input: "{| border=" output: [Text(text="{| border=")] --- name: no_table_close_row_attributes -label: Don't parse row attributes as attributes if the table doesn't exist. +label: don't parse row attributes as attributes if the table doesn't exist input: "{| |- border="1"" output: [Text(text="{| |- border=\"1\"")] --- name: no_table_close_cell -label: Don't parse cells if the table doesn't close. +label: don't parse cells if the table doesn't close input: "{| | border="1"| test || red | foo" output: [Text(text="{| | border=\"1\"| test || red | foo")] --- name: crazy_no_table_close -label: Lost of opened wiki syntax without closes. +label: lost of opened wiki syntax without closes input: "{{{ {{ {| | |- {| |} || ! !! bar \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_text_outside_cell -label: Parse text inside table but outside of a cell. 
+label: parse text inside table but outside of a cell input: "{|\n bar \n | foo \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_cell_with_leading_characters -label: Fail to create a table cell when there are leading non-whitespace characters. +label: fail to create a table cell when there are leading non-whitespace characters input: "{|\n bar | foo \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_row_with_leading_characters -label: Fail to create a table row when there are leading non-whitespace characters. +label: fail to create a table row when there are leading non-whitespace characters input: "{|\n bar |- foo \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: template_inside_table_cell -label: Template within table cell. +label: template within table cell input: "{|\n |{{foo\n|bar=baz}} \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes -label: Parse table cell style attributes. +label: parse table cell style attributes input: "{| \n | name="foo bar"| test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_empty_attributes -label: Parse table cell with style markers but no attributes. +label: parse table cell with style markers but no attributes input: "{| \n | | test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_with_dash -label: Parse a situation in which a cell line looks like a row line. 
+label: parse a situation in which a cell line looks like a row line input: "{|\n ||- \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_quote_with_pipe -label: Pipe inside an attribute quote should still be used as a style separator. +label: pipe inside an attribute quote should still be used as a style separator input: "{| \n | name="foo|bar"| test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe -label: Pipe inside an attribute name should still be used as a style separator. +label: pipe inside an attribute name should still be used as a style separator input: "{| \n | name|="foo bar" | test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_pipe_after_equals -label: Pipe inside an attribute should still be used as a style separator after an equals. +label: pipe inside an attribute should still be used as a style separator after an equals input: "{| \n | name=|"foo|bar"| test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseOpen(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_templates -label: Pipe inside attributes shouldn't be style separator. +label: pipe inside attributes shouldn't be style separator input: "{| \n | {{comment|template=baz}} | test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: header_cell_attributes -label: Parse header cell style attributes. +label: parse header cell style attributes input: "{| \n ! 
name="foo bar"| test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_cell_attributes -label: Parse cell style attributes of inline cells. +label: parse cell style attributes of inline cells input: "{| \n ! name="foo bar" | test ||color="red"| markup!!foo | time \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes -label: Parse table row style attributes. +label: parse table row style attributes input: "{| \n |- name="foo bar"\n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes_crazy_whitespace -label: Parse table row style attributes with different whitespace. +label: parse table row style attributes with different whitespace input: "{| \t \n |- \t name="foo bar" \t \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding=" \t \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_attributes -label: Parse table style attributes. 
+label: parse table style attributes input: "{| name="foo bar"\n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_table_attributes -label: Correctly handle attributes in inline tables. +label: correctly handle attributes in inline tables input: "{| foo="tee bar" |}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="tee bar"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_incorrect_attributes -label: Parse incorrect table style attributes. +label: parse incorrect table style attributes input: "{| name="foo\n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] From b7c46a6dca5ed71326a7a8e9c3f7071a9297524b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 Oct 2014 20:44:57 -0500 Subject: [PATCH 070/102] Add tables to changelog. --- CHANGELOG | 1 + docs/changelog.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index b4b01d6..9c05482 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ v0.4 (unreleased): - The parser is now distributed with Windows binaries, fixing an issue that prevented Windows users from using the C tokenizer. +- Added support for parsing wikicode tables. - Added a script to test for memory leaks in scripts/memtest.py. - Added a script to do releases in scripts/release.sh. - skip_style_tags can now be passed to mwparserfromhell.parse() (previously, diff --git a/docs/changelog.rst b/docs/changelog.rst index 9fdfef2..1854fa0 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -9,6 +9,7 @@ Unreleased - The parser is now distributed with Windows binaries, fixing an issue that prevented Windows users from using the C tokenizer. +- Added support for parsing wikicode tables. - Added a script to test for memory leaks in :file:`scripts/memtest.py`. - Added a script to do releases in :file:`scripts/release.sh`. - *skip_style_tags* can now be passed to :func:`mwparserfromhell.parse() From bd85805f8fc693b8c4b2b32f700b74d4eb4e774b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 Oct 2014 20:49:16 -0500 Subject: [PATCH 071/102] Add integration tests for token roundtripping. 
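Every tokenizer test case now doubles as a builder test: when a test class sets the `roundtrip` flag, the expected token stream is fed to the Builder and the resulting tree is stringified, which must reproduce the original wikitext exactly. A minimal sketch of the invariant being checked (the "{{foo|bar}}" sample below is illustrative, not one of the shipped test cases):

    from mwparserfromhell.parser import tokens
    from mwparserfromhell.parser.builder import Builder

    # The token stream the tokenizer emits for "{{foo|bar}}":
    stream = [tokens.TemplateOpen(), tokens.Text(text="foo"),
              tokens.TemplateParamSeparator(), tokens.Text(text="bar"),
              tokens.TemplateClose()]
    # Rebuilding a tree from the stream and stringifying it must give
    # back the source, character for character.
    assert str(Builder().build(stream)) == "{{foo|bar}}"

Note that Builder.build() reverses and pops tokens off the list it is given, so a throwaway list is passed here.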
--- tests/_test_tokenizer.py | 11 ++++++++--- tests/test_roundtripping.py | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 tests/test_roundtripping.py diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py index bfd4857..e44280b 100644 --- a/tests/_test_tokenizer.py +++ b/tests/_test_tokenizer.py @@ -25,8 +25,9 @@ import codecs from os import listdir, path import sys -from mwparserfromhell.compat import py3k +from mwparserfromhell.compat import py3k, str from mwparserfromhell.parser import tokens +from mwparserfromhell.parser.builder import Builder class _TestParseError(Exception): """Raised internally when a test could not be parsed.""" @@ -50,8 +51,12 @@ class TokenizerTestCase(object): *label* for the method's docstring. """ def inner(self): - expected = data["output"] - actual = self.tokenizer().tokenize(data["input"]) + if hasattr(self, "roundtrip"): + expected = data["input"] + actual = str(Builder().build(data["output"])) + else: + expected = data["output"] + actual = self.tokenizer().tokenize(data["input"]) self.assertEqual(expected, actual) if not py3k: inner.__name__ = funcname.encode("utf8") diff --git a/tests/test_roundtripping.py b/tests/test_roundtripping.py new file mode 100644 index 0000000..5360387 --- /dev/null +++ b/tests/test_roundtripping.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012-2014 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import unicode_literals + +try: + import unittest2 as unittest +except ImportError: + import unittest + +from ._test_tokenizer import TokenizerTestCase + +class TestRoundtripping(TokenizerTestCase, unittest.TestCase): + """Test cases for roundtripping tokens back to wikitext.""" + + @classmethod + def setUpClass(cls): + cls.roundtrip = True + + +if __name__ == "__main__": + unittest.main(verbosity=2) From 7489253e3289dd821144e324f375d31039cc4a6f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 Oct 2014 21:45:17 -0500 Subject: [PATCH 072/102] Break at 80 cols for most lines. 
--- mwparserfromhell/parser/tokenizer.c | 64 ++++++++++++++++++++++-------------- mwparserfromhell/parser/tokenizer.py | 18 ++++++---- 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index bad72ef..ce46388 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2456,13 +2456,15 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) /* Emit a table tag. */ -static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, - const char* tag, PyObject* style, PyObject* padding, - const char* close_open_markup, PyObject* contents, - const char* open_close_markup) +static int +Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, + const char* tag, PyObject* style, PyObject* padding, + const char* close_open_markup, PyObject* contents, + const char* open_close_markup) { - PyObject *open_open_kwargs, *open_open_markup_unicode, *close_open_kwargs, *close_open_markup_unicode, - *open_close_kwargs, *open_close_markup_unicode; + PyObject *open_open_kwargs, *open_open_markup_unicode, *close_open_kwargs, + *close_open_markup_unicode, *open_close_kwargs, + *open_close_markup_unicode; open_open_kwargs = PyDict_New(); if (!open_open_kwargs) @@ -2472,7 +2474,8 @@ static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_marku Py_DECREF(open_open_kwargs); goto fail_decref_all; } - PyDict_SetItemString(open_open_kwargs, "wiki_markup", open_open_markup_unicode); + PyDict_SetItemString(open_open_kwargs, "wiki_markup", + open_open_markup_unicode); Py_DECREF(open_open_markup_unicode); if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs)) goto fail_decref_all; @@ -2494,7 +2497,8 @@ static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_marku Py_DECREF(close_open_kwargs); goto fail_decref_padding_contents; } - PyDict_SetItemString(close_open_kwargs, "wiki_markup", close_open_markup_unicode); + PyDict_SetItemString(close_open_kwargs, "wiki_markup", + close_open_markup_unicode); Py_DECREF(close_open_markup_unicode); } PyDict_SetItemString(close_open_kwargs, "padding", padding); @@ -2516,7 +2520,8 @@ static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_marku Py_DECREF(open_close_kwargs); return -1; } - PyDict_SetItemString(open_close_kwargs, "wiki_markup", open_close_markup_unicode); + PyDict_SetItemString(open_close_kwargs, "wiki_markup", + open_close_markup_unicode); Py_DECREF(open_close_markup_unicode); if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs)) return -1; @@ -2538,8 +2543,9 @@ static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_marku /* Parse until ``end_token`` as style attributes for a table. 
*/ -static PyObject* Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, - int break_on_table_end) +static PyObject* +Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, + int break_on_table_end) { TagData *data = TagData_new(); PyObject *padding, *trash; @@ -2655,7 +2661,8 @@ static int Tokenizer_handle_table_start(Tokenizer* self) self->head += 2; } - if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL, table, "|}")) + if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL, + table, "|}")) return -1; // offset displacement done by _parse() self->head--; @@ -2665,7 +2672,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self) /* Return the stack in order to handle the table end. */ -static PyObject * Tokenizer_handle_table_end(Tokenizer* self) +static PyObject* Tokenizer_handle_table_end(Tokenizer* self) { self->head += 2; return Tokenizer_pop(self); @@ -2720,7 +2727,8 @@ static int Tokenizer_handle_table_row(Tokenizer* self) return -1; } - if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, "")) + if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, + "")) return -1; // offset displacement done by _parse() self->head--; @@ -2739,8 +2747,9 @@ static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) Parse as normal syntax unless we hit a style marker, then parse style as HTML attributes and the remainder as normal syntax. */ -static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, - const char *tag, uint64_t line_context) +static int +Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, + const char *tag, uint64_t line_context) { uint64_t old_context = self->topstack->context; uint64_t cell_context; @@ -2757,7 +2766,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, return 0; } - cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); + cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | + LC_TABLE_CELL_STYLE | line_context, 1); if (BAD_ROUTE) { trash = Tokenizer_pop(self); Py_XDECREF(trash); @@ -2772,7 +2782,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if (cell_context & LC_TABLE_CELL_STYLE) { Py_DECREF(cell); self->head = reset + strlen(markup); - if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context)) + if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | + line_context)) return -1; padding = Tokenizer_parse_as_table_style(self, '|', 0); if (BAD_ROUTE) { @@ -2790,7 +2801,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, } // Don't parse the style separator self->head++; - cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context, 1); + cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | + line_context, 1); if (BAD_ROUTE) { Py_DECREF(padding); Py_DECREF(style); @@ -2818,10 +2830,12 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if (style) { close_open_markup = "|"; } - if (Tokenizer_emit_table_tag(self, markup, tag, style, padding, close_open_markup, cell, "")) + if (Tokenizer_emit_table_tag(self, markup, tag, style, padding, + close_open_markup, cell, "")) return -1; // keep header/cell line contexts - self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE); + self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | + LC_TABLE_TD_LINE); // offset displacement 
done by parse() self->head--; return 0; @@ -2831,7 +2845,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, Returns the context, stack, and whether to reset the cell for style in a tuple. */ -static PyObject* Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) +static PyObject* +Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) { if (reset_for_style) self->topstack->context |= LC_TABLE_CELL_STYLE; @@ -2844,7 +2859,8 @@ static PyObject* Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_ Make sure we are not trying to write an invalid character. Return 0 if everything is safe, or -1 if the route must be failed. */ -static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) +static int +Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) { if (context & LC_FAIL_NEXT) return -1; @@ -2895,7 +2911,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE d } else if (context & LC_FAIL_ON_LBRACE) { if (data == '{' || (Tokenizer_READ_BACKWARDS(self, 1) == '{' && - Tokenizer_READ_BACKWARDS(self, 2) == '{')) { + Tokenizer_READ_BACKWARDS(self, 2) == '{')) { if (context & LC_TEMPLATE) self->topstack->context |= LC_FAIL_ON_EQUALS; else diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 527d364..ad4895e 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1010,7 +1010,8 @@ class Tokenizer(object): if style: self._emit_all(style) if close_open_markup: - self._emit(tokens.TagCloseOpen(wiki_markup=close_open_markup, padding=padding)) + self._emit(tokens.TagCloseOpen(wiki_markup=close_open_markup, + padding=padding)) else: self._emit(tokens.TagCloseOpen(padding=padding)) if contents: @@ -1130,7 +1131,8 @@ class Tokenizer(object): return try: - cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context | contexts.TABLE_CELL_STYLE) + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | + line_context | contexts.TABLE_CELL_STYLE) except BadRoute: self._head = reset self._pop() @@ -1140,7 +1142,8 @@ class Tokenizer(object): reset_for_style = cell_context & contexts.TABLE_CELL_STYLE if reset_for_style: self._head = reset + len(markup) - self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) + self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | + line_context) try: padding = self._parse_as_table_style("|") except BadRoute: @@ -1151,7 +1154,8 @@ class Tokenizer(object): # Don't parse the style separator self._head += 1 try: - cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) + cell = self._parse(contexts.TABLE_OPEN | + contexts.TABLE_CELL_OPEN | line_context) except BadRoute: self._head = reset ret = self._pop() @@ -1160,9 +1164,11 @@ class Tokenizer(object): self._context = old_context close_open_markup = "|" if reset_for_style else None - self._emit_table_tag(markup, tag, style, padding, close_open_markup, cell, "") + self._emit_table_tag(markup, tag, style, padding, close_open_markup, + cell, "") # keep header/cell line contexts - self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE) + self._context |= cell_context & (contexts.TABLE_TH_LINE | + contexts.TABLE_TD_LINE) # offset displacement done by parse() self._head -= 1 From 92cf8f2c03a8b339baa9e5a31c18c80ce635b2fb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 15:13:53 -0500 
Subject: [PATCH 073/102] Add a couple more tests involving templates. --- tests/tokenizer/tables.mwtest | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index b411045..4e4fe74 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -90,7 +90,7 @@ output: [Text(text="{| | border=\"1\"| test || red | foo")] --- name: crazy_no_table_close -label: lost of opened wiki syntax without closes +label: lots of opened wiki syntax without closes input: "{{{ {{ {| Date: Wed, 22 Oct 2014 15:38:13 -0500 Subject: [PATCH 074/102] Add a test for tokenizer line 1384. --- tests/tokenizer/tables.mwtest | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 4e4fe74..59ad934 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -369,6 +369,13 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_fir --- +name: inappropriate_marker_at_line_start +label: an inappropriate marker (a right bracket) at the start of a line in the table +input: "{|\n}" +output: [Text(text="{|\n}")] + +--- + name: recursion_five_hundred_opens label: test potentially dangerous recursion: five hundred table openings, without spaces input: "{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|" From 457355d4bf976986f3471a2e1de39e9762a5dac3 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 18:52:58 -0500 Subject: [PATCH 075/102] Remove try/except that is impossible to fail inside of. --- mwparserfromhell/parser/tokenizer.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index ad4895e..9787c5f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1119,8 +1119,8 @@ class Tokenizer(object): return self._pop() def _handle_table_cell(self, markup, tag, line_context): - """Parse as normal syntax unless we hit a style marker, then parse style - as HTML attributes and the remainder as normal syntax.""" + """Parse as normal syntax unless we hit a style marker, then parse + style as HTML attributes and the remainder as normal syntax.""" old_context = self._context reset = self._head reset_for_style, padding, style = False, "", None @@ -1144,12 +1144,7 @@ class Tokenizer(object): self._head = reset + len(markup) self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) - try: - padding = self._parse_as_table_style("|") - except BadRoute: - self._head = reset - self._pop() - raise + padding = self._parse_as_table_style("|") style = self._pop() # Don't parse the style separator self._head += 1 From 5d29bff918ad80b150bfc51aa407019ff51229e2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 19:04:11 -0500 Subject: [PATCH 076/102] Remove an incorrect usage of Py_XDECREF(). 
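Py_XDECREF() must only be given a PyObject pointer: it reads a reference count out of the object header and may invoke the type's deallocator. The `text` buffer in this function is a plain C string (the surrounding code fills it with '{' characters and a NUL terminator), so the two calls removed below were decrementing a "refcount" built from raw character bytes: memory corruption, not cleanup.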
--- mwparserfromhell/parser/tokenizer.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index ce46388..10a03a9 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -676,11 +676,8 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) RESET_ROUTE(); for (i = 0; i < braces; i++) text[i] = '{'; text[braces] = '\0'; - if (Tokenizer_emit_text_then_stack(self, text)) { - Py_XDECREF(text); + if (Tokenizer_emit_text_then_stack(self, text)) return -1; - } - Py_XDECREF(text); return 0; } else From 504b8bace08429e6a778f1fa69331cb5e849c043 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 19:22:50 -0500 Subject: [PATCH 077/102] Add test code for a missing branch of Tag.wiki_markup.setter; cleanup. --- mwparserfromhell/nodes/tag.py | 2 +- tests/test_tag.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index e9531e7..e3c7260 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -223,7 +223,7 @@ class Tag(Node): def wiki_markup(self, value): self._wiki_markup = str(value) if value else None if not value or not self.closing_wiki_markup: - self.closing_wiki_markup = str(value) if value else None + self._closing_wiki_markup = self._wiki_markup @self_closing.setter def self_closing(self, value): diff --git a/tests/test_tag.py b/tests/test_tag.py index b33b0c2..3beea98 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -246,6 +246,9 @@ class TestTag(TreeEqualityTestCase): node.closing_wiki_markup = "|}" self.assertEqual("|}", node.closing_wiki_markup) self.assertEqual("{|\n|}", node) + node.wiki_markup = "!!" + self.assertEqual("|}", node.closing_wiki_markup) + self.assertEqual("!!\n|}", node) node.wiki_markup = False self.assertFalse(node.closing_wiki_markup) self.assertEqual("\n
    ", node) From 913ff590c8e90f771e16e150b239147bd32f1c8d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 20:34:36 -0500 Subject: [PATCH 078/102] Cleanup; add a missing test. --- mwparserfromhell/parser/tokenizer.c | 6 ------ mwparserfromhell/parser/tokenizer.py | 2 +- tests/tokenizer/tags_wikimarkup.mwtest | 7 +++++++ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 10a03a9..faed5d7 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2783,12 +2783,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, line_context)) return -1; padding = Tokenizer_parse_as_table_style(self, '|', 0); - if (BAD_ROUTE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); - self->head = reset; - return 0; - } if (!padding) return -1; style = Tokenizer_pop(self); diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9787c5f..dd5d6d9 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1325,7 +1325,7 @@ class Tokenizer(object): elif this in ("\n", ":") and self._context & contexts.DL_TERM: self._handle_dl_term() if this == "\n": - # kill potential table contexts + # Kill potential table contexts self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS # Start of table parsing elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index 04f617a..c709ba7 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -447,6 +447,13 @@ output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Tag --- +name: dt_dd_mix4 +label: another example of correct dt/dd usage, with a trigger for a specific parse route +input: ";foo]:bar" +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo]"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")] + +--- + name: ul_ol_dt_dd_mix label: an assortment of uls, ols, dds, and dts input: ";:#*foo\n:#*;foo\n#*;:foo\n*;:#foo" From e1ebb59b9e1be3fe2ffd64c679e02983234d20ae Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 22:59:42 -0500 Subject: [PATCH 079/102] Ensure token list is copied before being fed to the builder. --- tests/_test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py index e44280b..17d588b 100644 --- a/tests/_test_tokenizer.py +++ b/tests/_test_tokenizer.py @@ -53,7 +53,7 @@ class TokenizerTestCase(object): def inner(self): if hasattr(self, "roundtrip"): expected = data["input"] - actual = str(Builder().build(data["output"])) + actual = str(Builder().build(data["output"][:])) else: expected = data["output"] actual = self.tokenizer().tokenize(data["input"]) From 640005dbb2eb641572f9880aaa72c3c6347802f9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 23 Oct 2014 21:27:21 -0500 Subject: [PATCH 080/102] Tokenizer cleanup; make inline table syntax invalid as it should be. 
--- mwparserfromhell/parser/tokenizer.c | 56 ++++++++++++++--------------- mwparserfromhell/parser/tokenizer.py | 70 +++++++++++++++++------------------- 2 files changed, 61 insertions(+), 65 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index faed5d7..c53a420 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2423,34 +2423,6 @@ static int Tokenizer_handle_dl_term(Tokenizer* self) } /* - Handle the end of the stream of wikitext. -*/ -static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) -{ - PyObject *token, *text, *trash; - int single; - - if (context & AGG_FAIL) { - if (context & LC_TAG_BODY) { - token = PyList_GET_ITEM(self->topstack->stack, 1); - text = PyObject_GetAttrString(token, "text"); - if (!text) - return NULL; - single = IS_SINGLE(text); - Py_DECREF(text); - if (single) - return Tokenizer_handle_single_tag_end(self); - } - else if (context & AGG_DOUBLE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); - } - return Tokenizer_fail_route(self); - } - return Tokenizer_pop(self); -} - -/* Emit a table tag. */ static int @@ -2847,6 +2819,34 @@ Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) } /* + Handle the end of the stream of wikitext. +*/ +static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) +{ + PyObject *token, *text, *trash; + int single; + + if (context & AGG_FAIL) { + if (context & LC_TAG_BODY) { + token = PyList_GET_ITEM(self->topstack->stack, 1); + text = PyObject_GetAttrString(token, "text"); + if (!text) + return NULL; + single = IS_SINGLE(text); + Py_DECREF(text); + if (single) + return Tokenizer_handle_single_tag_end(self); + } + else if (context & AGG_DOUBLE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + } + return Tokenizer_fail_route(self); + } + return Tokenizer_pop(self); +} + +/* Make sure we are not trying to write an invalid character. Return 0 if everything is safe, or -1 if the route must be failed. 
*/ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index dd5d6d9..7921e7c 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -991,17 +991,6 @@ class Tokenizer(object): else: self._emit_text("\n") - def _handle_end(self): - """Handle the end of the stream of wikitext.""" - if self._context & contexts.FAIL: - if self._context & contexts.TAG_BODY: - if is_single(self._stack[1].text): - return self._handle_single_tag_end() - if self._context & contexts.DOUBLE: - self._pop() - self._fail_route() - return self._pop() - def _emit_table_tag(self, open_open_markup, tag, style, padding, close_open_markup, contents, open_close_markup): """Emit a table tag.""" @@ -1020,22 +1009,21 @@ class Tokenizer(object): self._emit_text(tag) self._emit(tokens.TagCloseClose()) - def _parse_as_table_style(self, end_token, break_on_table_end=False): + def _parse_as_table_style(self, end_token): """Parse until ``end_token`` as style attributes for a table.""" data = _TagOpenData() data.context = _TagOpenData.CX_ATTR_READY while True: - this, next = self._read(), self._read(1) - table_end = break_on_table_end and this == "|" and next == "}" + this = self._read() can_exit = (not data.context & data.CX_QUOTED or data.context & data.CX_NOTE_SPACE) - if (this == end_token and can_exit) or table_end: + if this == end_token and can_exit: if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): self._push_tag_buffer(data) if this.isspace(): data.padding_buffer["first"] += this return data.padding_buffer["first"] - elif this is self.END or table_end or this == end_token: + elif this is self.END or this == end_token: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: # Unclosed attribute quote: reset, don't die @@ -1052,31 +1040,27 @@ class Tokenizer(object): def _handle_table_start(self): """Handle the start of a table.""" reset = self._head + 1 - style, table = None, None self._head += 2 self._push(contexts.TABLE_OPEN) try: - padding = self._parse_as_table_style("\n", break_on_table_end=True) + padding = self._parse_as_table_style("\n") except BadRoute: self._head = reset self._emit_text("{|") return style = self._pop() - # continue to parse if it is NOT an inline table - if "\n" in padding: - self._head += 1 - try: - table = self._parse(contexts.TABLE_OPEN) - except BadRoute: - self._head = reset - self._emit_text("{|") - return - else: - # close tag - self._head += 2 + + self._head += 1 + try: + table = self._parse(contexts.TABLE_OPEN) + except BadRoute: + self._head = reset + self._emit_text("{|") + return + self._emit_table_tag("{|", "table", style, padding, None, table, "|}") - # offset displacement done by _parse() + # Offset displacement done by _parse(): self._head -= 1 def _handle_table_end(self): @@ -1087,7 +1071,6 @@ class Tokenizer(object): def _handle_table_row(self): """Parse as style until end of the line, then continue.""" reset = self._head - style, padding = None, "" self._head += 2 if not self._can_recurse(): self._emit_text("|-") @@ -1102,7 +1085,8 @@ class Tokenizer(object): self._pop() raise style = self._pop() - # don't parse the style separator + + # Don't parse the style separator: self._head += 1 try: row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) @@ -1110,8 +1094,9 @@ class Tokenizer(object): self._head = reset self._pop() raise + self._emit_table_tag("|-", "tr", style, padding, None, row, "") - # offset displacement done by parse() + # Offset displacement done by parse(): 
self._head -= 1 def _handle_table_row_end(self): @@ -1146,7 +1131,7 @@ class Tokenizer(object): line_context) padding = self._parse_as_table_style("|") style = self._pop() - # Don't parse the style separator + # Don't parse the style separator: self._head += 1 try: cell = self._parse(contexts.TABLE_OPEN | @@ -1161,10 +1146,10 @@ class Tokenizer(object): close_open_markup = "|" if reset_for_style else None self._emit_table_tag(markup, tag, style, padding, close_open_markup, cell, "") - # keep header/cell line contexts + # Keep header/cell line contexts: self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE) - # offset displacement done by parse() + # Offset displacement done by parse(): self._head -= 1 def _handle_table_cell_end(self, reset_for_style=False): @@ -1176,6 +1161,17 @@ class Tokenizer(object): self._context &= ~contexts.TABLE_CELL_STYLE return self._pop(keep_context=True) + def _handle_end(self): + """Handle the end of the stream of wikitext.""" + if self._context & contexts.FAIL: + if self._context & contexts.TAG_BODY: + if is_single(self._stack[1].text): + return self._handle_single_tag_end() + if self._context & contexts.DOUBLE: + self._pop() + self._fail_route() + return self._pop() + def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" context = self._context From 4d4045902d1b56369c962a79a8e6a95e09a068c5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 23 Oct 2014 21:27:55 -0500 Subject: [PATCH 081/102] Update table tests to reflect new grammar. --- tests/tokenizer/tables.mwtest | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 59ad934..e042467 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -6,9 +6,9 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- name: inline_table -label: correctly handle tables with close on the same line +label: tables with a close on the same line are not valid input: "{||}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=""), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [Text(text="{||}")] --- @@ -127,7 +127,7 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: characters_after_inline_table label: handle characters after an inline table close input: "{| |} tsta" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" tsta")] +output: [Text(text="{| |} tsta")] --- @@ -342,9 +342,9 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_fir --- name: inline_table_attributes -label: correctly handle attributes in inline tables +label: handle attributes in inline tables input: "{| foo="tee bar" |}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="tee bar"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [Text(text='{| foo="tee bar" |}')] --- @@ -376,14 +376,28 @@ output: [Text(text="{|\n}")] --- -name: recursion_five_hundred_opens -label: test potentially dangerous recursion: five hundred table openings, without spaces -input: 
"{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|" -output: [Text(text="{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|")] +name: fake_close_near_start +label: a fake closing token at the end of the first line in the table +input: "{| class="wikitable" style="text-align: center; width=100%;|}\n|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"text-align:"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="center;"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="width"), TagAttrEquals(), Text(text="100%;|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- -name: recursion_one_hundred_opens -label: test potentially dangerous recursion: one hundred table openings, with spaces -input: "{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|" -output: [Text(text="{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|")] +name: fake_close_near_start_2 +label: a fake closing token at the end of the first line in the table +input: "{| class="wikitable|}"\n|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: junk_after_table_start +label: ignore more junk on the first line of the table +input: "{| class="wikitable" | foobar\n|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), 
Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="foobar"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: junk_after_table_row +label: ignore junk on the first line of a table row +input: "{|\n|- foo="bar" | baz\n|blerp\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="bar"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="baz"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="blerp\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] From fb261450d8fa0d3e666fe48a000a6afd6694c89a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 23 Oct 2014 21:40:50 -0500 Subject: [PATCH 082/102] Port tokenizer updates to C. --- mwparserfromhell/parser/tokenizer.c | 80 ++++++++++++++----------------------- 1 file changed, 31 insertions(+), 49 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index c53a420..1b68b46 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2513,13 +2513,12 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, Parse until ``end_token`` as style attributes for a table. 
*/ static PyObject* -Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, - int break_on_table_end) +Tokenizer_parse_as_table_style(Tokenizer* self, char end_token) { TagData *data = TagData_new(); PyObject *padding, *trash; - Py_UNICODE this, next; - int can_exit, table_end; + Py_UNICODE this; + int can_exit; if (!data) return NULL; @@ -2527,10 +2526,8 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, while (1) { this = Tokenizer_READ(self, 0); - next = Tokenizer_READ(self, 1); can_exit = (!(data->context & TAG_QUOTED) || data->context & TAG_NOTE_SPACE); - table_end = (break_on_table_end && this == '|' && next == '}'); - if ((this == end_token && can_exit) || table_end) { + if (this == end_token && can_exit) { if (data->context & (TAG_ATTR_NAME | TAG_ATTR_VALUE)) { if (Tokenizer_push_tag_buffer(self, data)) { TagData_dealloc(data); @@ -2545,7 +2542,7 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, return NULL; return padding; } - else if (!this || table_end || this == end_token) { + else if (!this || this == end_token) { if (self->topstack->context & LC_TAG_ATTR) { if (data->context & TAG_QUOTED) { // Unclosed attribute quote: reset, don't die @@ -2577,13 +2574,13 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, static int Tokenizer_handle_table_start(Tokenizer* self) { Py_ssize_t reset = self->head + 1; - PyObject *style, *padding, *newline_character; + PyObject *style, *padding; PyObject *table = NULL; self->head += 2; if(Tokenizer_push(self, LC_TABLE_OPEN)) return -1; - padding = Tokenizer_parse_as_table_style(self, '\n', 1); + padding = Tokenizer_parse_as_table_style(self, '\n'); if (BAD_ROUTE) { RESET_ROUTE(); self->head = reset; @@ -2599,41 +2596,27 @@ static int Tokenizer_handle_table_start(Tokenizer* self) return -1; } - newline_character = PyUnicode_FromString("\n"); - if (!newline_character) { + self->head++; + table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); Py_DECREF(padding); Py_DECREF(style); - return -1; - } - // continue to parse if it is NOT an inline table - if (PyUnicode_Contains(padding, newline_character)) { - Py_DECREF(newline_character); - self->head++; - table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); - if (BAD_ROUTE) { - Py_DECREF(padding); - Py_DECREF(style); - RESET_ROUTE(); - self->head = reset; - if (Tokenizer_emit_text(self, "{|")) - return -1; - return 0; - } - if (!table) { - Py_DECREF(padding); - Py_DECREF(style); + self->head = reset; + if (Tokenizer_emit_text(self, "{|")) return -1; - } - } else { - Py_DECREF(newline_character); - // close tag - self->head += 2; + return 0; + } + if (!table) { + Py_DECREF(padding); + Py_DECREF(style); + return -1; } if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL, table, "|}")) return -1; - // offset displacement done by _parse() + // Offset displacement done by _parse() self->head--; return 0; } @@ -2665,7 +2648,7 @@ static int Tokenizer_handle_table_row(Tokenizer* self) if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) return -1; - padding = Tokenizer_parse_as_table_style(self, '\n', 0); + padding = Tokenizer_parse_as_table_style(self, '\n'); if (BAD_ROUTE) { trash = Tokenizer_pop(self); Py_XDECREF(trash); @@ -2679,7 +2662,8 @@ static int Tokenizer_handle_table_row(Tokenizer* self) Py_DECREF(padding); return -1; } - // don't parse the style separator + + // Don't parse the style separator self->head++; row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1); if (BAD_ROUTE) { 
@@ -2696,10 +2680,9 @@ static int Tokenizer_handle_table_row(Tokenizer* self) return -1; } - if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, - "")) + if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, "")) return -1; - // offset displacement done by _parse() + // Offset displacement done by _parse() self->head--; return 0; } @@ -2754,7 +2737,7 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context)) return -1; - padding = Tokenizer_parse_as_table_style(self, '|', 0); + padding = Tokenizer_parse_as_table_style(self, '|'); if (!padding) return -1; style = Tokenizer_pop(self); @@ -2796,10 +2779,9 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if (Tokenizer_emit_table_tag(self, markup, tag, style, padding, close_open_markup, cell, "")) return -1; - // keep header/cell line contexts - self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | - LC_TABLE_TD_LINE); - // offset displacement done by parse() + // Keep header/cell line contexts + self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE); + // Offset displacement done by parse() self->head--; return 0; } @@ -3092,7 +3074,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) { if (Tokenizer_handle_dl_term(self)) return NULL; - // kill potential table contexts + // Kill potential table contexts if (this == '\n') self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; } @@ -3130,7 +3112,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) else if (this == '|' && this_context & LC_TABLE_CELL_STYLE) { return Tokenizer_handle_table_cell_end(self, 1); } - // on newline, clear out cell line contexts + // On newline, clear out cell line contexts else if (this == '\n' && this_context & LC_TABLE_CELL_LINE_CONTEXTS) { self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; if (Tokenizer_emit_char(self, this)) From 8480381a31b5da4571e32f75a18f9f15e03d770c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 23 Oct 2014 21:53:55 -0500 Subject: [PATCH 083/102] Credit for table parsing code. [skip ci] --- CHANGELOG | 2 +- docs/changelog.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 9c05482..3471531 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,7 +2,7 @@ v0.4 (unreleased): - The parser is now distributed with Windows binaries, fixing an issue that prevented Windows users from using the C tokenizer. -- Added support for parsing wikicode tables. +- Added support for parsing wikicode tables (patches by David Winegar). - Added a script to test for memory leaks in scripts/memtest.py. - Added a script to do releases in scripts/release.sh. - skip_style_tags can now be passed to mwparserfromhell.parse() (previously, diff --git a/docs/changelog.rst b/docs/changelog.rst index 1854fa0..b3e7548 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -9,7 +9,7 @@ Unreleased - The parser is now distributed with Windows binaries, fixing an issue that prevented Windows users from using the C tokenizer. -- Added support for parsing wikicode tables. +- Added support for parsing wikicode tables (patches by David Winegar). - Added a script to test for memory leaks in :file:`scripts/memtest.py`. - Added a script to do releases in :file:`scripts/release.sh`. 
- *skip_style_tags* can now be passed to :func:`mwparserfromhell.parse() From 9fc4b909e150cd786e97caf7daeb479733e5330e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 24 Oct 2014 03:40:37 -0500 Subject: [PATCH 084/102] Refactor a lot of table error recovery code. --- mwparserfromhell/parser/contexts.py | 4 +- mwparserfromhell/parser/tokenizer.c | 100 +++++++++++++++-------------------- mwparserfromhell/parser/tokenizer.h | 2 +- mwparserfromhell/parser/tokenizer.py | 82 ++++++++++++---------------- tests/tokenizer/tables.mwtest | 7 +++ 5 files changed, 87 insertions(+), 108 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index ef44ce2..17912cb 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -171,7 +171,7 @@ TABLE_ROW_OPEN = 1 << 33 TABLE_TD_LINE = 1 << 34 TABLE_TH_LINE = 1 << 35 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE -TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + + TABLE_ROW_OPEN + +TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + TABLE_TD_LINE + TABLE_TH_LINE) # Global contexts: @@ -184,6 +184,6 @@ FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE) UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) -DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE +DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 1b68b46..301ecfc 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2510,10 +2510,9 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, } /* - Parse until ``end_token`` as style attributes for a table. + Handle style attributes for a table until an ending token. */ -static PyObject* -Tokenizer_parse_as_table_style(Tokenizer* self, char end_token) +static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token) { TagData *data = TagData_new(); PyObject *padding, *trash; @@ -2569,9 +2568,9 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token) } /* - Handle the start of a table. + Parse a wikicode table by starting with the first line. */ -static int Tokenizer_handle_table_start(Tokenizer* self) +static int Tokenizer_parse_table(Tokenizer* self) { Py_ssize_t reset = self->head + 1; PyObject *style, *padding; @@ -2580,7 +2579,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self) if(Tokenizer_push(self, LC_TABLE_OPEN)) return -1; - padding = Tokenizer_parse_as_table_style(self, '\n'); + padding = Tokenizer_handle_table_style(self, '\n'); if (BAD_ROUTE) { RESET_ROUTE(); self->head = reset; @@ -2622,20 +2621,10 @@ static int Tokenizer_handle_table_start(Tokenizer* self) } /* - Return the stack in order to handle the table end. -*/ -static PyObject* Tokenizer_handle_table_end(Tokenizer* self) -{ - self->head += 2; - return Tokenizer_pop(self); -} - -/* Parse as style until end of the line, then continue. 
*/ static int Tokenizer_handle_table_row(Tokenizer* self) { - Py_ssize_t reset = self->head; PyObject *padding, *style, *row, *trash; self->head += 2; @@ -2648,11 +2637,10 @@ static int Tokenizer_handle_table_row(Tokenizer* self) if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) return -1; - padding = Tokenizer_parse_as_table_style(self, '\n'); + padding = Tokenizer_handle_table_style(self, '\n'); if (BAD_ROUTE) { trash = Tokenizer_pop(self); Py_XDECREF(trash); - self->head = reset; return 0; } if (!padding) @@ -2666,14 +2654,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self) // Don't parse the style separator self->head++; row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1); - if (BAD_ROUTE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); - Py_DECREF(padding); - Py_DECREF(style); - self->head = reset; - return 0; - } if (!row) { Py_DECREF(padding); Py_DECREF(style); @@ -2688,14 +2668,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self) } /* - Return the stack in order to handle the table row end. -*/ -static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) -{ - return Tokenizer_pop(self); -} - -/* Parse as normal syntax unless we hit a style marker, then parse style as HTML attributes and the remainder as normal syntax. */ @@ -2705,11 +2677,10 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, { uint64_t old_context = self->topstack->context; uint64_t cell_context; - Py_ssize_t reset = self->head; - PyObject *padding, *cell, *trash; - PyObject *style = NULL; + PyObject *padding, *cell, *style = NULL; const char *close_open_markup = NULL; self->head += strlen(markup); + Py_ssize_t reset = self->head; if (!Tokenizer_CAN_RECURSE(self)) { if (Tokenizer_emit_text(self, markup)) @@ -2720,12 +2691,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); - if (BAD_ROUTE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); - self->head = reset; - return 0; - } if (!cell) return -1; cell_context = self->topstack->context; @@ -2733,11 +2698,11 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if (cell_context & LC_TABLE_CELL_STYLE) { Py_DECREF(cell); - self->head = reset + strlen(markup); + self->head = reset; if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context)) return -1; - padding = Tokenizer_parse_as_table_style(self, '|'); + padding = Tokenizer_handle_table_style(self, '|'); if (!padding) return -1; style = Tokenizer_pop(self); @@ -2749,14 +2714,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, self->head++; cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context, 1); - if (BAD_ROUTE) { - Py_DECREF(padding); - Py_DECREF(style); - trash = Tokenizer_pop(self); - Py_XDECREF(trash); - self->head = reset; - return 0; - } if (!cell) { Py_DECREF(padding); Py_DECREF(style); @@ -2801,6 +2758,23 @@ Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) } /* + Return the stack in order to handle the table row end. +*/ +static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) +{ + return Tokenizer_pop(self); +} + +/* + Return the stack in order to handle the table end. +*/ +static PyObject* Tokenizer_handle_table_end(Tokenizer* self) +{ + self->head += 2; + return Tokenizer_pop(self); +} + +/* Handle the end of the stream of wikitext. 
*/ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) @@ -2819,9 +2793,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) if (single) return Tokenizer_handle_single_tag_end(self); } - else if (context & AGG_DOUBLE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); + else { + if (context & LC_TABLE_CELL_OPEN) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + context = self->topstack->context; + } + if (context & AGG_DOUBLE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + } } return Tokenizer_fail_route(self); } @@ -3082,7 +3063,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) // Start of table parsing else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) { if (Tokenizer_CAN_RECURSE(self)) { - if (Tokenizer_handle_table_start(self)) + if (Tokenizer_parse_table(self)) return NULL; } else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next)) @@ -3197,7 +3178,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) self->skip_style_tags = skip_style_tags; tokens = Tokenizer_parse(self, context, 1); - if (!tokens && !PyErr_Occurred()) { + if ((!tokens && !PyErr_Occurred()) || self->topstack) { if (!ParserError) { if (load_exceptions()) return NULL; @@ -3206,6 +3187,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) RESET_ROUTE(); PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); } + else if (self->topstack) + PyErr_SetString(ParserError, + "C tokenizer exited with non-empty token stack"); else PyErr_SetString(ParserError, "C tokenizer exited unexpectedly"); return NULL; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 8d2d428..33ba0e1 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -175,7 +175,7 @@ static PyObject* TagCloseClose; #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) -#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) +#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN) #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) #define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 7921e7c..3ac25a5 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1009,8 +1009,8 @@ class Tokenizer(object): self._emit_text(tag) self._emit(tokens.TagCloseClose()) - def _parse_as_table_style(self, end_token): - """Parse until ``end_token`` as style attributes for a table.""" + def _handle_table_style(self, end_token): + """Handle style attributes for a table until ``end_token``.""" data = _TagOpenData() data.context = _TagOpenData.CX_ATTR_READY while True: @@ -1037,14 +1037,13 @@ class Tokenizer(object): self._handle_tag_data(data, this) self._head += 1 - def _handle_table_start(self): - """Handle the start of a table.""" + def _parse_table(self): + """Parse a wikicode table by starting with the first line.""" reset = self._head + 1 self._head += 2 - self._push(contexts.TABLE_OPEN) try: - padding = self._parse_as_table_style("\n") + padding = 
self._handle_table_style("\n") except BadRoute: self._head = reset self._emit_text("{|") @@ -1063,14 +1062,8 @@ class Tokenizer(object): # Offset displacement done by _parse(): self._head -= 1 - def _handle_table_end(self): - """Return the stack in order to handle the table end.""" - self._head += 2 - return self._pop() - def _handle_table_row(self): """Parse as style until end of the line, then continue.""" - reset = self._head self._head += 2 if not self._can_recurse(): self._emit_text("|-") @@ -1079,67 +1072,47 @@ class Tokenizer(object): self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) try: - padding = self._parse_as_table_style("\n") + padding = self._handle_table_style("\n") except BadRoute: - self._head = reset self._pop() raise style = self._pop() # Don't parse the style separator: self._head += 1 - try: - row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) - except BadRoute: - self._head = reset - self._pop() - raise + row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) self._emit_table_tag("|-", "tr", style, padding, None, row, "") # Offset displacement done by parse(): self._head -= 1 - def _handle_table_row_end(self): - """Return the stack in order to handle the table row end.""" - return self._pop() - def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse style as HTML attributes and the remainder as normal syntax.""" old_context = self._context - reset = self._head - reset_for_style, padding, style = False, "", None + padding, style = "", None self._head += len(markup) + reset = self._head if not self._can_recurse(): self._emit_text(markup) self._head -= 1 return - try: - cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | - line_context | contexts.TABLE_CELL_STYLE) - except BadRoute: - self._head = reset - self._pop() - raise + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | + line_context | contexts.TABLE_CELL_STYLE) cell_context = self._context self._context = old_context reset_for_style = cell_context & contexts.TABLE_CELL_STYLE if reset_for_style: - self._head = reset + len(markup) + self._head = reset self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) - padding = self._parse_as_table_style("|") + padding = self._handle_table_style("|") style = self._pop() # Don't parse the style separator: self._head += 1 - try: - cell = self._parse(contexts.TABLE_OPEN | - contexts.TABLE_CELL_OPEN | line_context) - except BadRoute: - self._head = reset - ret = self._pop() - raise + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | + line_context) cell_context = self._context self._context = old_context @@ -1161,12 +1134,23 @@ class Tokenizer(object): self._context &= ~contexts.TABLE_CELL_STYLE return self._pop(keep_context=True) + def _handle_table_row_end(self): + """Return the stack in order to handle the table row end.""" + return self._pop() + + def _handle_table_end(self): + """Return the stack in order to handle the table end.""" + self._head += 2 + return self._pop() + def _handle_end(self): """Handle the end of the stream of wikitext.""" if self._context & contexts.FAIL: if self._context & contexts.TAG_BODY: if is_single(self._stack[1].text): return self._handle_single_tag_end() + if self._context & contexts.TABLE_CELL_OPEN: + self._pop() if self._context & contexts.DOUBLE: self._pop() self._fail_route() @@ -1327,19 +1311,19 @@ class Tokenizer(object): elif this == "{" and next == "|" and 
                  (self._read(-1) in ("\n", self.START) or
                  (self._read(-2) in ("\n", self.START) and
                  self._read(-1).isspace())):
             if self._can_recurse():
-                self._handle_table_start()
+                self._parse_table()
             else:
                 self._emit_text("{|")
         elif self._context & contexts.TABLE_OPEN:
-            if this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE:
+            if this == next == "|" and self._context & contexts.TABLE_TD_LINE:
                 if self._context & contexts.TABLE_CELL_OPEN:
                     return self._handle_table_cell_end()
                 self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE)
-            elif this == "|" and next == "|" and self._context & contexts.TABLE_TH_LINE:
+            elif this == next == "|" and self._context & contexts.TABLE_TH_LINE:
                 if self._context & contexts.TABLE_CELL_OPEN:
                     return self._handle_table_cell_end()
                 self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE)
-            elif this == "!" and next == "!" and self._context & contexts.TABLE_TH_LINE:
+            elif this == next == "!" and self._context & contexts.TABLE_TH_LINE:
                 if self._context & contexts.TABLE_CELL_OPEN:
                     return self._handle_table_cell_end()
                 self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE)
@@ -1387,6 +1371,10 @@ class Tokenizer(object):
         self._text = [segment for segment in split if segment]
         self._head = self._global = self._depth = self._cycles = 0
         try:
-            return self._parse(context)
+            tokens = self._parse(context)
         except BadRoute:  # pragma: no cover (untestable/exceptional case)
             raise ParserError("Python tokenizer exited with BadRoute")
+        if self._stacks:  # pragma: no cover (untestable/exceptional case)
+            err = "Python tokenizer exited with non-empty token stack"
+            raise ParserError(err)
+        return tokens
diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest
index e042467..16012cf 100644
--- a/tests/tokenizer/tables.mwtest
+++ b/tests/tokenizer/tables.mwtest
@@ -61,6 +61,13 @@ output: [Text(text="{| \n|- \n ")]
 
 ---
 
+name: no_table_close_row_and_cell
+label: no table close while inside a cell inside a row
+input: "{| \n|- \n|"
+output: [Text(text="{| \n|- \n|")]
+
+---
+
 name: no_table_close_attributes
 label: don't parse attributes as attributes if the table doesn't exist
 input: "{| border="1""

From a15f6172c09ee22aae4899547975eec4b2b0ced3 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 24 Oct 2014 03:43:22 -0500
Subject: [PATCH 085/102] Minor bugfix: move a variable declaration to the top
 of its block for C89 compatibility.

---
 mwparserfromhell/parser/tokenizer.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 301ecfc..38e3a4c 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -2677,10 +2677,12 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 {
     uint64_t old_context = self->topstack->context;
     uint64_t cell_context;
+    Py_ssize_t reset;
     PyObject *padding, *cell, *style = NULL;
     const char *close_open_markup = NULL;
+
     self->head += strlen(markup);
-    Py_ssize_t reset = self->head;
+    reset = self->head;
 
     if (!Tokenizer_CAN_RECURSE(self)) {
         if (Tokenizer_emit_text(self, markup))

From 0ae8460cb7a5c30383dec33ae8d045bb5f63a28b Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 24 Oct 2014 16:33:50 -0500
Subject: [PATCH 086/102] Add changelog entry for roundtripping tests.
 [skip ci]

---
 CHANGELOG          | 2 ++
 docs/changelog.rst | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index 3471531..848305d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -26,6 +26,8 @@ v0.4 (unreleased):
   an incorrect node tree to be built.
 - Fixed a parser bug involving nested tags, and another involving comments in
   template names.
+- Added tests to ensure that parsed trees convert back to wikicode without
+  unintentional modifications.
 - Test coverage has been improved, and some minor related bugs have been
   fixed.
 - Updated and fixed some documentation.
diff --git a/docs/changelog.rst b/docs/changelog.rst
index b3e7548..a04410f 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -37,6 +37,8 @@ Unreleased
   exception or allow an incorrect node tree to be built.
 - Fixed a parser bug involving nested tags, and another involving comments in
   template names.
+- Added tests to ensure that parsed trees convert back to wikicode without
+  unintentional modifications.
 - Test coverage has been improved, and some minor related bugs have been
   fixed.
 - Updated and fixed some documentation.

From 5f6afe7bb58b45baa6752f0c968577df6033943e Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 24 Dec 2014 13:18:46 -0500
Subject: [PATCH 087/102] Fix version string to 0.4.dev0.

---
 mwparserfromhell/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py
index 9c29fd2..287536a 100644
--- a/mwparserfromhell/__init__.py
+++ b/mwparserfromhell/__init__.py
@@ -29,7 +29,7 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode.
 __author__ = "Ben Kurtovic"
 __copyright__ = "Copyright (C) 2012, 2013, 2014 Ben Kurtovic"
 __license__ = "MIT License"
-__version__ = "0.4.dev"
+__version__ = "0.4.dev0"
 __email__ = "ben.kurtovic@gmail.com"
 
 from . import (compat, definitions, nodes, parser, smart_list, string_mixin,

From 47b44a973092c5de0cefedcce5d11f39f7652d5a Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 24 Dec 2014 13:19:24 -0500
Subject: [PATCH 088/102] Add a failing test for #89.

---
 tests/tokenizer/tags.mwtest | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest
index f979329..55b18f7 100644
--- a/tests/tokenizer/tags.mwtest
+++ b/tests/tokenizer/tags.mwtest
@@ -611,3 +611,10 @@ name: capitalization
 label: caps should be ignored within tag names
 input: "<NoWiKi>{{test}}</nOwIkI>"
 output: [TagOpenOpen(), Text(text="NoWiKi"), TagCloseOpen(padding=""), Text(text="{{test}}"), TagOpenClose(), Text(text="nOwIkI"), TagCloseClose()]
+
+---
+
+name: unparsable_with_intermediates
+label: an unparsable tag with intermediate tags inside of it
+input: "<nowiki><ref></ref></nowiki>"
+output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="<ref></ref>"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()]

From a00c645bd8692efdb3a667a7dd8f3d7bc7e9da44 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 24 Dec 2014 14:38:50 -0500
Subject: [PATCH 089/102] Fix handling of tag closes within <nowiki> (fixes
 #89).

---
 mwparserfromhell/parser/tokenizer.c  | 69 ++++++++++++++++++++++++++++++------
 mwparserfromhell/parser/tokenizer.py | 14 ++++++--
 tests/tokenizer/tags.mwtest          | 14 ++++++++
 3 files changed, 83 insertions(+), 14 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 38e3a4c..7d07ed8 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -69,15 +69,19 @@ static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2,
 
 /*
     Sanitize the name of a tag so it can be compared with others for equality.
 */
-static PyObject* strip_tag_name(PyObject* token)
+static PyObject* strip_tag_name(PyObject* token, int take_attr)
 {
     PyObject *text, *rstripped, *lowered;
 
-    text = PyObject_GetAttrString(token, "text");
-    if (!text)
-        return NULL;
-    rstripped = PyObject_CallMethod(text, "rstrip", NULL);
-    Py_DECREF(text);
+    if (take_attr) {
+        text = PyObject_GetAttrString(token, "text");
+        if (!text)
+            return NULL;
+        rstripped = PyObject_CallMethod(text, "rstrip", NULL);
+        Py_DECREF(text);
+    }
+    else
+        rstripped = PyObject_CallMethod(token, "rstrip", NULL);
     if (!rstripped)
         return NULL;
     lowered = PyObject_CallMethod(rstripped, "lower", NULL);
@@ -1812,8 +1816,9 @@ static PyObject* Tokenizer_handle_tag_close_close(Tokenizer* self)
             valid = 0;
             break;
         case 1: {
-            so = strip_tag_name(first);
-            sc = strip_tag_name(PyList_GET_ITEM(self->topstack->stack, 1));
+            so = strip_tag_name(first, 1);
+            sc = strip_tag_name(
+                PyList_GET_ITEM(self->topstack->stack, 1), 1);
             if (so && sc) {
                 if (PyUnicode_Compare(so, sc))
                     valid = 0;
@@ -1848,7 +1853,11 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
 */
 static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
 {
+    Textbuffer* buffer;
+    PyObject *buf_tmp, *end_tag, *start_tag;
     Py_UNICODE this, next;
+    Py_ssize_t reset;
+    int cmp;
 
     while (1) {
         this = Tokenizer_READ(self, 0);
@@ -1856,10 +1865,48 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
         next = Tokenizer_READ(self, 1);
         if (!this)
             return Tokenizer_fail_route(self);
         else if (this == '<' && next == '/') {
-            if (Tokenizer_handle_tag_open_close(self))
+            self->head += 2;
+            reset = self->head - 1;
+            buffer = Textbuffer_new();
+            if (!buffer)
                 return NULL;
-            self->head++;
-            return Tokenizer_parse(self, 0, 0);
+            while ((this = Tokenizer_READ(self, 0))) {
+                if (this == '>') {
+                    buf_tmp = Textbuffer_render(buffer);
+                    if (!buf_tmp)
+                        return NULL;
+                    end_tag = strip_tag_name(buf_tmp, 0);
+                    Py_DECREF(buf_tmp);
+                    if (!end_tag)
+                        return NULL;
+                    start_tag = strip_tag_name(
+                        PyList_GET_ITEM(self->topstack->stack, 1), 1);
+                    if (!start_tag)
+                        return NULL;
+                    cmp = PyUnicode_Compare(start_tag, end_tag);
+                    Py_DECREF(end_tag);
+                    Py_DECREF(start_tag);
+                    if (cmp)
+                        goto no_matching_end;
+                    if (Tokenizer_emit(self, TagOpenClose))
+                        return NULL;
+                    if (Tokenizer_emit_textbuffer(self, buffer, 0))
+                        return NULL;
+                    if (Tokenizer_emit(self, TagCloseClose))
+                        return NULL;
+                    return Tokenizer_pop(self);
+                }
+                if (!this || this == '\n') {
+                    no_matching_end:
+                    Textbuffer_dealloc(buffer);
+                    self->head = reset;
+                    if (Tokenizer_emit_text(self, "</"))
+                        return NULL;
+                    break;
+                }
+                Textbuffer_write(&buffer, this);
+                self->head++;
+            }
         }
         else if (this == '&') {
             if (Tokenizer_parse_entity(self))
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 3ac25a5..607cc69 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -735,14 +735,22 @@ class Tokenizer(object):
 
     def _handle_blacklisted_tag(self):
         """Handle the body of an HTML tag that is parser-blacklisted."""
+        strip = lambda text: text.rstrip().lower()
         while True:
             this, next = self._read(), self._read(1)
             if this is self.END:
                 self._fail_route()
             elif this == "<" and next == "/":
-                self._handle_tag_open_close()
-                self._head += 1
-                return self._parse(push=False)
+                self._head += 3
+                if self._read() != ">" or (strip(self._read(-1)) !=
+                                           strip(self._stack[1].text)):
+                    self._head -= 1
+                    self._emit_text("</")
+                    continue
+                self._emit(tokens.TagOpenClose())
+                self._emit_text(self._read(-1))
+                self._emit(tokens.TagCloseClose())
+                return self._pop()
             elif this == "&":
                 self._parse_entity()
diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest
--- a/tests/tokenizer/tags.mwtest
+++ b/tests/tokenizer/tags.mwtest
@@ -618,3 +618,10 @@ name: unparsable_with_intermediates
 label: an unparsable tag with intermediate tags inside of it
 input: "<nowiki><ref></ref></nowiki>"
 output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="<ref></ref>"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()]
+
+---
+
+name: unparsable_with_intermediates_normalize
+label: an unparsable tag with intermediate tags inside of it, requiring normalization
+input: "<nowiki><ref></ref></nowIKI >"
+output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="<ref></ref>"), TagOpenClose(), Text(text="nowIKI "), TagCloseClose()]

From 53e92ae04c81678f034a456189501df556fe30d3 Mon Sep 17 00:00:00 2001
From: ricordisamoa
Date: Wed, 24 Dec 2014 23:26:36 +0100
Subject: [PATCH 090/102] Update README.rst for Pywikibot core

The new version ('core') is more widely used and more actively developed than
the previous one ('compat').
---
 README.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.rst b/README.rst
index b6bf7e7..27b112a 100644
--- a/README.rst
+++ b/README.rst
@@ -121,13 +121,13 @@ Integration
 ``Page`` objects have a ``parse`` method that essentially calls
 ``mwparserfromhell.parse()`` on ``page.get()``.
 
-If you're using Pywikipedia_, your code might look like this::
+If you're using Pywikibot_, your code might look like this::
 
     import mwparserfromhell
-    import wikipedia as pywikibot
+    import pywikibot
 
     def parse(title):
-        site = pywikibot.getSite()
+        site = pywikibot.Site()
         page = pywikibot.Page(site, title)
         text = page.get()
         return mwparserfromhell.parse(text)
@@ -158,5 +158,5 @@ If you're not using a library, you can parse any page using the following code
 .. _StackOverflow question: http://stackoverflow.com/questions/2817869/error-unable-to-find-vcvarsall-bat
 .. _get pip: http://pypi.python.org/pypi/pip
 .. _EarwigBot: https://github.com/earwig/earwigbot
-.. _Pywikipedia: https://www.mediawiki.org/wiki/Manual:Pywikipediabot
+.. _Pywikibot: https://www.mediawiki.org/wiki/Manual:Pywikibot
 .. _API: http://mediawiki.org/wiki/API

From 0f16d0c63ee024ea38391805bab7ad54b46bfd92 Mon Sep 17 00:00:00 2001
From: Kunal Mehta
Date: Sat, 27 Dec 2014 21:19:17 -0800
Subject: [PATCH 091/102] Target documentation for Python 3 usage

2 is dead, long live 3. Mainly turning print into a function and urllib
import fixes.
---
 README.rst                   | 33 +++++++++++++++++----------------
 docs/integration.rst         |  4 ++--
 docs/usage.rst               | 36 ++++++++++++++++++------------------
 mwparserfromhell/wikicode.py |  2 +-
 4 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/README.rst b/README.rst
index 27b112a..93dee92 100644
--- a/README.rst
+++ b/README.rst
@@ -47,19 +47,19 @@ For example::
 
     >>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?"
     >>> wikicode = mwparserfromhell.parse(text)
-    >>> print wikicode
+    >>> print(wikicode)
     I has a template! {{foo|bar|baz|eggs=spam}} See it?
     >>> templates = wikicode.filter_templates()
-    >>> print templates
+    >>> print(templates)
     ['{{foo|bar|baz|eggs=spam}}']
     >>> template = templates[0]
-    >>> print template.name
+    >>> print(template.name)
     foo
-    >>> print template.params
+    >>> print(template.params)
     ['bar', 'baz', 'eggs=spam']
-    >>> print template.get(1).value
+    >>> print(template.get(1).value)
     bar
-    >>> print template.get("eggs").value
+    >>> print(template.get("eggs").value)
     spam
 
 Since nodes can contain other nodes, getting nested templates is trivial::
@@ -73,14 +73,14 @@ templates manually.
This is possible because nodes can contain additional ``Wikicode`` objects:: >>> code = mwparserfromhell.parse("{{foo|this {{includes a|template}}}}") - >>> print code.filter_templates(recursive=False) + >>> print(code.filter_templates(recursive=False)) ['{{foo|this {{includes a|template}}}}'] >>> foo = code.filter_templates(recursive=False)[0] - >>> print foo.get(1).value + >>> print(foo.get(1).value) this {{includes a|template}} - >>> print foo.get(1).value.filter_templates()[0] + >>> print(foo.get(1).value.filter_templates()[0]) {{includes a|template}} - >>> print foo.get(1).value.filter_templates()[0].get(1).value + >>> print(foo.get(1).value.filter_templates()[0].get(1).value) template Templates can be easily modified to add, remove, or alter params. ``Wikicode`` @@ -95,19 +95,19 @@ whitespace:: ... if template.name.matches("Cleanup") and not template.has("date"): ... template.add("date", "July 2012") ... - >>> print code + >>> print(code) {{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{uncategorized}} >>> code.replace("{{uncategorized}}", "{{bar-stub}}") - >>> print code + >>> print(code) {{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}} - >>> print code.filter_templates() + >>> print(code.filter_templates()) ['{{cleanup|date=July 2012}}', '{{bar-stub}}'] You can then convert ``code`` back into a regular ``unicode`` object (for saving the page!) by calling ``unicode()`` on it:: >>> text = unicode(code) - >>> print text + >>> print(text) {{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}} >>> text == code True @@ -136,14 +136,15 @@ If you're not using a library, you can parse any page using the following code (via the API_):: import json - import urllib + from urllib.parse import urlencode + from urllib.request import urlopen import mwparserfromhell API_URL = "http://en.wikipedia.org/w/api.php" def parse(title): data = {"action": "query", "prop": "revisions", "rvlimit": 1, "rvprop": "content", "format": "json", "titles": title} - raw = urllib.urlopen(API_URL, urllib.urlencode(data)).read() + raw = urlopen(API_URL, urlencode(data).encode()).read() res = json.loads(raw) text = res["query"]["pages"].values()[0]["revisions"][0]["*"] return mwparserfromhell.parse(text) diff --git a/docs/integration.rst b/docs/integration.rst index 102b3b9..f6f3610 100644 --- a/docs/integration.rst +++ b/docs/integration.rst @@ -22,12 +22,12 @@ If you're not using a library, you can parse any page using the following code (via the API_):: import json - import urllib + import urllib.request import mwparserfromhell API_URL = "http://en.wikipedia.org/w/api.php" def parse(title): - raw = urllib.urlopen(API_URL, data).read() + raw = urllib.request.urlopen(API_URL, data).read() res = json.loads(raw) text = res["query"]["pages"].values()[0]["revisions"][0]["*"] return mwparserfromhell.parse(text) diff --git a/docs/usage.rst b/docs/usage.rst index c471397..a1adfce 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -12,19 +12,19 @@ extra methods. For example:: >>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?" >>> wikicode = mwparserfromhell.parse(text) - >>> print wikicode + >>> print(wikicode) I has a template! {{foo|bar|baz|eggs=spam}} See it? 
     >>> templates = wikicode.filter_templates()
-    >>> print templates
+    >>> print(templates)
     ['{{foo|bar|baz|eggs=spam}}']
     >>> template = templates[0]
-    >>> print template.name
+    >>> print(template.name)
     foo
-    >>> print template.params
+    >>> print(template.params)
     ['bar', 'baz', 'eggs=spam']
-    >>> print template.get(1).value
+    >>> print(template.get(1).value)
     bar
-    >>> print template.get("eggs").value
+    >>> print(template.get("eggs").value)
     spam
 
 Since nodes can contain other nodes, getting nested templates is trivial::
@@ -38,14 +38,14 @@ templates manually.
 This is possible because nodes can contain additional :class:`.Wikicode`
 objects::
 
     >>> code = mwparserfromhell.parse("{{foo|this {{includes a|template}}}}")
-    >>> print code.filter_templates(recursive=False)
+    >>> print(code.filter_templates(recursive=False))
     ['{{foo|this {{includes a|template}}}}']
     >>> foo = code.filter_templates(recursive=False)[0]
-    >>> print foo.get(1).value
+    >>> print(foo.get(1).value)
     this {{includes a|template}}
-    >>> print foo.get(1).value.filter_templates()[0]
+    >>> print(foo.get(1).value.filter_templates()[0])
     {{includes a|template}}
-    >>> print foo.get(1).value.filter_templates()[0].get(1).value
+    >>> print(foo.get(1).value.filter_templates()[0].get(1).value)
     template
 
 Templates can be easily modified to add, remove, or alter params.
@@ -61,24 +61,24 @@ takes care of capitalization and whitespace::
     ...     if template.name.matches("Cleanup") and not template.has("date"):
     ...         template.add("date", "July 2012")
     ...
-    >>> print code
+    >>> print(code)
     {{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{uncategorized}}
     >>> code.replace("{{uncategorized}}", "{{bar-stub}}")
-    >>> print code
+    >>> print(code)
     {{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}
-    >>> print code.filter_templates()
+    >>> print(code.filter_templates())
     ['{{cleanup|date=July 2012}}', '{{bar-stub}}']
 
-You can then convert ``code`` back into a regular :class:`unicode` object (for
-saving the page!) by calling :func:`unicode` on it::
+You can then convert ``code`` back into a regular :class:`str` object (for
+saving the page!) by calling :func:`str` on it::
 
-    >>> text = unicode(code)
-    >>> print text
+    >>> text = str(code)
+    >>> print(text)
     {{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}
     >>> text == code
     True
 
-(Likewise, use :func:`str(code) <str>` in Python 3.)
+(Likewise, use :func:`unicode(code) <unicode>` in Python 2.)
 
 For more tips, check out :class:`Wikicode's full method list <.Wikicode>` and
 the :mod:`list of Nodes <.nodes>`.
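The migrated examples above behave the same on Python 2 and 3 once ``print()``
and ``str()`` are used; a compact sketch in the same spirit, where the
wikitext is invented purely for illustration rather than taken from a real
page::

    import mwparserfromhell

    text = "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}"
    code = mwparserfromhell.parse(text)
    for template in code.filter_templates():
        # matches() ignores case and surrounding whitespace
        if template.name.matches("Cleanup") and not template.has("date"):
            template.add("date", "July 2012")
    print(code.filter_templates())  # print() form works on both 2 and 3
    text = str(code)                # round-trips back to page text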
diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py
index c24bc5f..ebfa9c7 100644
--- a/mwparserfromhell/wikicode.py
+++ b/mwparserfromhell/wikicode.py
@@ -567,7 +567,7 @@ class Wikicode(StringMixIn):
         following::
 
             >>> text = "Lorem ipsum {{foo|bar|{{baz}}|spam=eggs}}"
-            >>> print mwparserfromhell.parse(text).get_tree()
+            >>> print(mwparserfromhell.parse(text).get_tree())
             Lorem ipsum
             {{
                   foo

From c9ef040a0fed403a7bfec826b3b0485ecb984d27 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 28 Dec 2014 15:41:25 -0500
Subject: [PATCH 092/102] lego missed a spot in #91

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 93dee92..ae338e7 100644
--- a/README.rst
+++ b/README.rst
@@ -42,7 +42,7 @@ Normal usage is rather straightforward (where ``text`` is page text)::
     >>> wikicode = mwparserfromhell.parse(text)
 
 ``wikicode`` is a ``mwparserfromhell.Wikicode`` object, which acts like an
-ordinary ``unicode`` object (or ``str`` in Python 3) with some extra methods.
+ordinary ``str`` object (or ``unicode`` in Python 2) with some extra methods.
 For example::

From 77644ea0edbccadcd532cd932996a416105f38d5 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 28 Dec 2014 15:42:04 -0500
Subject: [PATCH 093/102] lego missed a spot in #91

---
 docs/usage.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/usage.rst b/docs/usage.rst
index a1adfce..ee667fd 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -7,7 +7,7 @@ Normal usage is rather straightforward (where ``text`` is page text)::
     >>> wikicode = mwparserfromhell.parse(text)
 
 ``wikicode`` is a :class:`mwparserfromhell.Wikicode <.Wikicode>` object, which
-acts like an ordinary ``unicode`` object (or ``str`` in Python 3) with some
+acts like an ordinary ``str`` object (or ``unicode`` in Python 2) with some
 extra methods. For example::

From c8b8cd6a605fdbd2d9cd73d4e9b486f7e3c883ac Mon Sep 17 00:00:00 2001
From: Kunal Mehta
Date: Sun, 28 Dec 2014 12:50:19 -0800
Subject: [PATCH 094/102] Another thing missed in #91

---
 README.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.rst b/README.rst
index ae338e7..45c7286 100644
--- a/README.rst
+++ b/README.rst
@@ -103,16 +103,16 @@ whitespace::
     >>> print(code.filter_templates())
     ['{{cleanup|date=July 2012}}', '{{bar-stub}}']
 
-You can then convert ``code`` back into a regular ``unicode`` object (for
-saving the page!) by calling ``unicode()`` on it::
+You can then convert ``code`` back into a regular ``str`` object (for
+saving the page!) by calling ``str()`` on it::
 
-    >>> text = unicode(code)
+    >>> text = str(code)
     >>> print(text)
     {{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}
     >>> text == code
     True
 
-Likewise, use ``str(code)`` in Python 3.
+Likewise, use ``unicode(code)`` in Python 2.
 
 Integration
 -----------

From d30222e126d86ed82f19ed091817391c139c62bc Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 28 Dec 2014 15:56:18 -0500
Subject: [PATCH 095/102] Fix integration docs based on README.md

---
 docs/integration.rst | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/docs/integration.rst b/docs/integration.rst
index f6f3610..bbd00bb 100644
--- a/docs/integration.rst
+++ b/docs/integration.rst
@@ -7,13 +7,13 @@ Integration
 :func:`mwparserfromhell.parse() <mwparserfromhell.parse>` on
 :meth:`~earwigbot.wiki.page.Page.get`.
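In EarwigBot, then, parsing a page should reduce to a couple of calls. A rough
sketch under the assumption of an already-connected ``site`` object (the
accessor name is taken from EarwigBot's API and may differ by version)::

    def parse(site, title):
        # Page.parse() essentially wraps mwparserfromhell.parse(page.get()),
        # per the sentence above; site.get_page() is assumed here.
        page = site.get_page(title)
        return page.parse()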
-If you're using Pywikipedia_, your code might look like this::
+If you're using Pywikibot_, your code might look like this::
 
     import mwparserfromhell
-    import wikipedia as pywikibot
+    import pywikibot
 
     def parse(title):
-        site = pywikibot.getSite()
+        site = pywikibot.Site()
         page = pywikibot.Page(site, title)
         text = page.get()
         return mwparserfromhell.parse(text)
@@ -22,16 +22,19 @@ If you're not using a library, you can parse any page using the following code
 (via the API_)::
 
     import json
-    import urllib.request
+    from urllib.parse import urlencode
+    from urllib.request import urlopen
     import mwparserfromhell
 
     API_URL = "http://en.wikipedia.org/w/api.php"
 
     def parse(title):
-        raw = urllib.request.urlopen(API_URL, data).read()
+        data = {"action": "query", "prop": "revisions", "rvlimit": 1,
+                "rvprop": "content", "format": "json", "titles": title}
+        raw = urlopen(API_URL, urlencode(data).encode()).read()
         res = json.loads(raw)
         text = res["query"]["pages"].values()[0]["revisions"][0]["*"]
         return mwparserfromhell.parse(text)
 
 .. _EarwigBot: https://github.com/earwig/earwigbot
-.. _Pywikipedia: https://www.mediawiki.org/wiki/Manual:Pywikipediabot
+.. _Pywikibot: https://www.mediawiki.org/wiki/Manual:Pywikibot
 .. _API: http://mediawiki.org/wiki/API

From de325a0aea1f428a52639668d6cfa15cdac13e00 Mon Sep 17 00:00:00 2001
From: John Vandenberg
Date: Sat, 10 Jan 2015 15:53:48 +0700
Subject: [PATCH 096/102] Issue #26 Use pure python on compilation failure

Allow the compilation of the extension to fail, and switch to pure python
mode.
---
 setup.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 68943ac..226c1cc 100644
--- a/setup.py
+++ b/setup.py
@@ -21,6 +21,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import os
 import sys
 
 if (sys.version_info[0] == 2 and sys.version_info[1] < 6) or \
    (sys.version_info[0] == 3 and sys.version_info[1] < 2):
     raise Exception("mwparserfromhell needs Python 2.6+ or 3.2+")
@@ -39,7 +40,57 @@ tokenizer = Extension("mwparserfromhell.parser._tokenizer",
                       sources=["mwparserfromhell/parser/tokenizer.c"],
                       depends=["mwparserfromhell/parser/tokenizer.h"])
 
-setup(
+
+def optional_compile_setup(func=setup, use_ext=True, *args, **kwargs):
+    """
+    Wrap setup to allow optional compilation of extensions.
+
+    Falls back to pure python mode (no extensions)
+    if compilation of extensions fails.
+    """
+    extensions = kwargs.get('ext_modules', None)
+
+    if use_ext and extensions:
+        try:
+            func(*args, **kwargs)
+            return
+        except (Exception, SystemExit) as e:
+            print('Building extension failed: %s' % repr(e))
+
+    if extensions:
+        if use_ext:
+            print('Falling back to pure python mode.')
+        else:
+            print('Using pure python mode.')
+
+        del kwargs['ext_modules']
+
+        # Basic algorithm to push the extension sources into
+        # the package as data.
+        ext_files = [(ext, filename)
+                     for ext in extensions
+                     for filename in ext.sources + ext.depends]
+
+        pkg_data = kwargs.get('package_data', {})
+        for ext, filename in ext_files:
+            ext_name_parts = ext.name.split('.')
+            pkg_name = '.'.join(ext_name_parts[0:-1])
+            pkg = pkg_data.setdefault(pkg_name, [])
+            # This assumes the extension's package name
+            # is the same prefix as the filename.
+            pkg.append(os.path.basename(filename))
+
+        kwargs['package_data'] = pkg_data
+
+        # Ensure the extension package is in the main packages list.
+        for name in pkg_data.keys():
+            if name not in kwargs['packages']:
+                kwargs['packages'].append(name)
+
+    func(*args, **kwargs)
+
+
+optional_compile_setup(
     name = "mwparserfromhell",
     packages = find_packages(exclude=("tests",)),
     ext_modules = [tokenizer],

From a64bae35c9395389d2ef96adc37d828cd217b911 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 11 Jan 2015 23:57:28 -0500
Subject: [PATCH 097/102] Add support for a NOWEB env var, update docs.

---
 CHANGELOG          | 8 ++++++--
 docs/changelog.rst | 8 ++++++--
 tests/test_docs.py | 2 ++
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 848305d..1e9801b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -24,10 +24,14 @@ v0.4 (unreleased):
 - If something goes wrong while parsing, ParserError will now be raised.
   Previously, the parser would produce an unclear BadRoute exception or allow
   an incorrect node tree to be built.
-- Fixed a parser bug involving nested tags, and another involving comments in
-  template names.
+- Fixed parser bugs involving:
+  - nested tags;
+  - comments in template names;
+  - tags inside of <nowiki> tags.
 - Added tests to ensure that parsed trees convert back to wikicode without
   unintentional modifications.
+- Added support for a NOWEB environment variable, which disables a unit test
+  that makes a web call.
 - Test coverage has been improved, and some minor related bugs have been
   fixed.
 - Updated and fixed some documentation.
diff --git a/docs/changelog.rst b/docs/changelog.rst
index a04410f..7ab211b 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -35,10 +35,14 @@ Unreleased
 - If something goes wrong while parsing, :exc:`.ParserError` will now be
   raised. Previously, the parser would produce an unclear :exc:`.BadRoute`
   exception or allow an incorrect node tree to be built.
-- Fixed a parser bug involving nested tags, and another involving comments in
-  template names.
+- Fixed parser bugs involving:
+  - nested tags;
+  - comments in template names;
+  - tags inside of ``<nowiki>`` tags.
 - Added tests to ensure that parsed trees convert back to wikicode without
   unintentional modifications.
+- Added support for a :envvar:`NOWEB` environment variable, which disables a
+  unit test that makes a web call.
 - Test coverage has been improved, and some minor related bugs have been
   fixed.
 - Updated and fixed some documentation.
diff --git a/tests/test_docs.py b/tests/test_docs.py
index c873f0e..566a281 100644
--- a/tests/test_docs.py
+++ b/tests/test_docs.py
@@ -22,6 +22,7 @@
 
 from __future__ import print_function, unicode_literals
 import json
+import os
 
 try:
     import unittest2 as unittest
@@ -111,6 +112,7 @@ class TestDocs(unittest.TestCase):
         self.assertPrint(text, res)
         self.assertEqual(text, code)
 
+    @unittest.skipIf("NOWEB" in os.environ, "web test disabled by environ var")
     def test_readme_5(self):
         """test a block of example code in the README; includes a web call"""
         url1 = "http://en.wikipedia.org/w/api.php"

From 4e8ce523858fb5d8777b4ab2ee89635fa721a08f Mon Sep 17 00:00:00 2001
From: John Vandenberg
Date: Mon, 12 Jan 2015 18:33:26 +1100
Subject: [PATCH 098/102] Support 'setup.py test' and test without extension

'setup.py test' also uses SystemExit, with args[0] as False. Detect and
re-raise.

Add support for building without extension even when compiler is functional,
and set up extension-less travis builds.
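Put differently, the wrapper must tell a compiler failure apart from the
SystemExit(False) that setuptools' test command raises when the test suite
fails; only the former should trigger the pure-Python fallback. A minimal
sketch of that dispatch, simplified from the patch below with a made-up
function name::

    def on_setup_exit(e):
        # 'setup.py test' reports failure as SystemExit(False): re-raise so
        # the failure is not masked by a pure-Python retry.
        if e.args and e.args[0] is False:
            raise e
        # Otherwise (e.g. a compiler error), log and let the caller retry
        # setup() without ext_modules.
        print('setup with extension failed: %s' % repr(e))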
---
 .travis.yml |  5 +++++
 setup.py    | 59 +++++++++++++++++++++++++++++++++++-------------------------
 2 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index c8dbb88..daa31ac 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,3 +12,8 @@ script:
   - coverage run --source=mwparserfromhell setup.py -q test
 after_success:
   - coveralls
+
+env:
+  matrix:
+    - WITHOUT_EXTENSION=0
+    - WITHOUT_EXTENSION=1
diff --git a/setup.py b/setup.py
index 226c1cc..761bb40 100644
--- a/setup.py
+++ b/setup.py
@@ -28,6 +28,9 @@ if (sys.version_info[0] == 2 and sys.version_info[1] < 6) or \
    (sys.version_info[0] == 3 and sys.version_info[1] < 2):
     raise Exception("mwparserfromhell needs Python 2.6+ or 3.2+")
 
+if sys.version_info >= (3, 0):
+    basestring = (str, )
+
 from setuptools import setup, find_packages, Extension
 
 from mwparserfromhell import __version__
@@ -40,8 +43,25 @@ tokenizer = Extension("mwparserfromhell.parser._tokenizer",
                       sources=["mwparserfromhell/parser/tokenizer.c"],
                       depends=["mwparserfromhell/parser/tokenizer.h"])
 
+use_extension = True
+
+# Allow env var WITHOUT_EXTENSION and args --with[out]-extension
+if '--without-extension' in sys.argv:
+    use_extension = False
+elif '--with-extension' in sys.argv:
+    pass
+elif os.environ.get('WITHOUT_EXTENSION', '0') == '1':
+    use_extension = False
+
+# Remove the command line arguments as they aren't understood by
+# setuptools/distutils
+sys.argv = [arg for arg in sys.argv
+            if not arg.startswith('--with')
+            and not arg.endswith('-extension')]
 
-def optional_compile_setup(func=setup, use_ext=True, *args, **kwargs):
+
+def optional_compile_setup(func=setup, use_ext=use_extension,
+                           *args, **kwargs):
     """
     Wrap setup to allow optional compilation of extensions.
 
@@ -54,8 +74,19 @@ def optional_compile_setup(func=setup, use_ext=True, *args, **kwargs):
         try:
             func(*args, **kwargs)
             return
-        except (Exception, SystemExit) as e:
-            print('Building extension failed: %s' % repr(e))
+        except SystemExit as e:
+            assert(e.args)
+            if e.args[0] is False:
+                raise
+            elif isinstance(e.args[0], basestring):
+                if e.args[0].startswith('usage: '):
+                    raise
+                else:
+                    # Fallback to pure python mode
+                    print('setup with extension failed: %s' % repr(e))
+                    pass
+        except Exception as e:
+            print('setup with extension failed: %s' % repr(e))
 
     if extensions:
         if use_ext:
@@ -65,28 +96,6 @@ def optional_compile_setup(func=setup, use_ext=use_extension,
 
         del kwargs['ext_modules']
 
-        # Basic algorithm to push the extension sources into
-        # the package as data.
-        ext_files = [(ext, filename)
-                     for ext in extensions
-                     for filename in ext.sources + ext.depends]
-
-        pkg_data = kwargs.get('package_data', {})
-        for ext, filename in ext_files:
-            ext_name_parts = ext.name.split('.')
-            pkg_name = '.'.join(ext_name_parts[0:-1])
-            pkg = pkg_data.setdefault(pkg_name, [])
-            # This assumes the extension's package name
-            # is the same prefix as the filename.
-            pkg.append(os.path.basename(filename))
-
-        kwargs['package_data'] = pkg_data
-
-        # Ensure the extension package is in the main packages list.
-        for name in pkg_data.keys():
-            if name not in kwargs['packages']:
-                kwargs['packages'].append(name)
-
     func(*args, **kwargs)

From e71e7b4ece8c3185f9a0ae8a14ddc0995f470570 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Mon, 12 Jan 2015 02:40:40 -0500
Subject: [PATCH 099/102] Update copyright years for 2015; fix whitespace in
 docs.
---
 LICENSE | 2 +-
 docs/changelog.rst | 2 ++
 docs/conf.py | 2 +-
 mwparserfromhell/__init__.py | 4 ++--
 mwparserfromhell/definitions.py | 2 +-
 mwparserfromhell/nodes/__init__.py | 2 +-
 mwparserfromhell/nodes/argument.py | 2 +-
 mwparserfromhell/nodes/comment.py | 2 +-
 mwparserfromhell/nodes/external_link.py | 2 +-
 mwparserfromhell/nodes/extras/__init__.py | 2 +-
 mwparserfromhell/nodes/extras/attribute.py | 2 +-
 mwparserfromhell/nodes/extras/parameter.py | 2 +-
 mwparserfromhell/nodes/heading.py | 2 +-
 mwparserfromhell/nodes/html_entity.py | 2 +-
 mwparserfromhell/nodes/tag.py | 2 +-
 mwparserfromhell/nodes/template.py | 2 +-
 mwparserfromhell/nodes/text.py | 2 +-
 mwparserfromhell/nodes/wikilink.py | 2 +-
 mwparserfromhell/parser/__init__.py | 2 +-
 mwparserfromhell/parser/builder.py | 2 +-
 mwparserfromhell/parser/contexts.py | 2 +-
 mwparserfromhell/parser/tokenizer.c | 2 +-
 mwparserfromhell/parser/tokenizer.h | 2 +-
 mwparserfromhell/parser/tokenizer.py | 2 +-
 mwparserfromhell/parser/tokens.py | 2 +-
 mwparserfromhell/smart_list.py | 2 +-
 mwparserfromhell/string_mixin.py | 2 +-
 mwparserfromhell/utils.py | 2 +-
 mwparserfromhell/wikicode.py | 2 +-
 scripts/memtest.py | 2 +-
 setup.py | 2 +-
 tests/_test_tokenizer.py | 2 +-
 tests/_test_tree_equality.py | 2 +-
 tests/test_argument.py | 2 +-
 tests/test_attribute.py | 2 +-
 tests/test_builder.py | 2 +-
 tests/test_comment.py | 2 +-
 tests/test_ctokenizer.py | 2 +-
 tests/test_docs.py | 2 +-
 tests/test_external_link.py | 2 +-
 tests/test_heading.py | 2 +-
 tests/test_html_entity.py | 2 +-
 tests/test_parameter.py | 2 +-
 tests/test_parser.py | 2 +-
 tests/test_pytokenizer.py | 2 +-
 tests/test_roundtripping.py | 2 +-
 tests/test_smart_list.py | 2 +-
 tests/test_string_mixin.py | 2 +-
 tests/test_tag.py | 2 +-
 tests/test_template.py | 2 +-
 tests/test_text.py | 2 +-
 tests/test_tokens.py | 2 +-
 tests/test_utils.py | 2 +-
 tests/test_wikicode.py | 2 +-
 tests/test_wikilink.py | 2 +-
 55 files changed, 57 insertions(+), 55 deletions(-)

diff --git a/LICENSE b/LICENSE
index 327905b..92f5e42 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (C) 2012-2014 Ben Kurtovic
+Copyright (C) 2012-2015 Ben Kurtovic
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 7ab211b..2285a82 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -36,9 +36,11 @@ Unreleased
   raised. Previously, the parser would produce an unclear :exc:`.BadRoute`
   exception or allow an incorrect node tree to be built.
 - Fixed parser bugs involving:
+
   - nested tags;
   - comments in template names;
   - tags inside of ``<nowiki>`` tags.
+
 - Added tests to ensure that parsed trees convert back to wikicode without
   unintentional modifications.
 - Added support for a :envvar:`NOWEB` environment variable, which disables a
diff --git a/docs/conf.py b/docs/conf.py
index dd1d6e1..3f82ea7 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -42,7 +42,7 @@ master_doc = 'index'
 
 # General information about the project.
project = u'mwparserfromhell' -copyright = u'2012, 2013, 2014 Ben Kurtovic' +copyright = u'2012, 2013, 2014, 2015 Ben Kurtovic' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 287536a..1c50753 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ outrageously powerful parser for `MediaWiki `_ wikicode. """ __author__ = "Ben Kurtovic" -__copyright__ = "Copyright (C) 2012, 2013, 2014 Ben Kurtovic" +__copyright__ = "Copyright (C) 2012, 2013, 2014, 2015 Ben Kurtovic" __license__ = "MIT License" __version__ = "0.4.dev0" __email__ = "ben.kurtovic@gmail.com" diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index af41f49..e0ba16b 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index 8e71c8b..d0258ca 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/argument.py b/mwparserfromhell/nodes/argument.py index a595dfb..39c33ae 100644 --- a/mwparserfromhell/nodes/argument.py +++ b/mwparserfromhell/nodes/argument.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/comment.py b/mwparserfromhell/nodes/comment.py index fcfd946..3e82be7 100644 --- a/mwparserfromhell/nodes/comment.py +++ b/mwparserfromhell/nodes/comment.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index f98a1e5..a07e985 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/extras/__init__.py 
b/mwparserfromhell/nodes/extras/__init__.py index 7c0262b..854fa45 100644 --- a/mwparserfromhell/nodes/extras/__init__.py +++ b/mwparserfromhell/nodes/extras/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index 7d296dc..7c7dd56 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/extras/parameter.py b/mwparserfromhell/nodes/extras/parameter.py index 50c9ac0..48f610c 100644 --- a/mwparserfromhell/nodes/extras/parameter.py +++ b/mwparserfromhell/nodes/extras/parameter.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/heading.py b/mwparserfromhell/nodes/heading.py index 696b5ee..0db56f3 100644 --- a/mwparserfromhell/nodes/heading.py +++ b/mwparserfromhell/nodes/heading.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/html_entity.py b/mwparserfromhell/nodes/html_entity.py index 95f1492..e7f1bbc 100644 --- a/mwparserfromhell/nodes/html_entity.py +++ b/mwparserfromhell/nodes/html_entity.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index e3c7260..cf3b4a5 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index a9b14aa..7cbeb7d 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/text.py b/mwparserfromhell/nodes/text.py index 55c714e..e793c1f 100644 --- a/mwparserfromhell/nodes/text.py +++ 
b/mwparserfromhell/nodes/text.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/nodes/wikilink.py b/mwparserfromhell/nodes/wikilink.py index f9c221c..88eaacc 100644 --- a/mwparserfromhell/nodes/wikilink.py +++ b/mwparserfromhell/nodes/wikilink.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 36cb511..ae13c76 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index decbe60..ad29f4d 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 17912cb..e98d8f7 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 7d07ed8..c125021 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1,6 +1,6 @@ /* Tokenizer for MWParserFromHell -Copyright (C) 2012-2014 Ben Kurtovic +Copyright (C) 2012-2015 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 33ba0e1..842e65d 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -1,6 +1,6 @@ /* Tokenizer Header File for MWParserFromHell -Copyright (C) 2012-2014 Ben Kurtovic +Copyright (C) 2012-2015 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 607cc69..36c83e1 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # 
# Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 2e38a1c..4668780 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/smart_list.py b/mwparserfromhell/smart_list.py index b4cfd1b..c552050 100644 --- a/mwparserfromhell/smart_list.py +++ b/mwparserfromhell/smart_list.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/string_mixin.py b/mwparserfromhell/string_mixin.py index 8da8692..01809a7 100644 --- a/mwparserfromhell/string_mixin.py +++ b/mwparserfromhell/string_mixin.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index 8f518a6..28823fc 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index ebfa9c7..c623971 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/scripts/memtest.py b/scripts/memtest.py index e6b8011..824d992 100644 --- a/scripts/memtest.py +++ b/scripts/memtest.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/setup.py b/setup.py index 68943ac..310b616 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #! 
/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py index 17d588b..1cbbc3d 100644 --- a/tests/_test_tokenizer.py +++ b/tests/_test_tokenizer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/_test_tree_equality.py b/tests/_test_tree_equality.py index bb713c2..086f113 100644 --- a/tests/_test_tree_equality.py +++ b/tests/_test_tree_equality.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_argument.py b/tests/test_argument.py index 3539ec4..70d8006 100644 --- a/tests/test_argument.py +++ b/tests/test_argument.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_attribute.py b/tests/test_attribute.py index 15e546d..b3e325d 100644 --- a/tests/test_attribute.py +++ b/tests/test_attribute.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_builder.py b/tests/test_builder.py index d4e6f73..9af4f21 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_comment.py b/tests/test_comment.py index cac8719..ad13f4a 100644 --- a/tests/test_comment.py +++ b/tests/test_comment.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_ctokenizer.py b/tests/test_ctokenizer.py index 52427e3..0d37485 100644 --- a/tests/test_ctokenizer.py +++ b/tests/test_ctokenizer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_docs.py b/tests/test_docs.py index 566a281..d50e90e 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben 
Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_external_link.py b/tests/test_external_link.py index c81470e..5137247 100644 --- a/tests/test_external_link.py +++ b/tests/test_external_link.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_heading.py b/tests/test_heading.py index 7c7a7ee..effc03b 100644 --- a/tests/test_heading.py +++ b/tests/test_heading.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_html_entity.py b/tests/test_html_entity.py index 3df596a..a13fd71 100644 --- a/tests/test_html_entity.py +++ b/tests/test_html_entity.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_parameter.py b/tests/test_parameter.py index 2a4bb75..71b298c 100644 --- a/tests/test_parameter.py +++ b/tests/test_parameter.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_parser.py b/tests/test_parser.py index 955f455..6885c37 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_pytokenizer.py b/tests/test_pytokenizer.py index 40e2caf..f009c14 100644 --- a/tests/test_pytokenizer.py +++ b/tests/test_pytokenizer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_roundtripping.py b/tests/test_roundtripping.py index 5360387..5c64535 100644 --- a/tests/test_roundtripping.py +++ b/tests/test_roundtripping.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 2012-2015 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_smart_list.py b/tests/test_smart_list.py index 13d96d2..a7106e4 100644 --- a/tests/test_smart_list.py +++ b/tests/test_smart_list.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2014 Ben Kurtovic +# Copyright (C) 
2012-2015 Ben Kurtovic
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
diff --git a/tests/test_string_mixin.py b/tests/test_string_mixin.py
index bc44f55..09e2e63 100644
--- a/tests/test_string_mixin.py
+++ b/tests/test_string_mixin.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2014 Ben Kurtovic
+# Copyright (C) 2012-2015 Ben Kurtovic
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
diff --git a/tests/test_tag.py b/tests/test_tag.py
index 3beea98..0f0040a 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2014 Ben Kurtovic
+# Copyright (C) 2012-2015 Ben Kurtovic
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
diff --git a/tests/test_template.py b/tests/test_template.py
index e015a6a..7ba3f64 100644
--- a/tests/test_template.py
+++ b/tests/test_template.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2014 Ben Kurtovic
+# Copyright (C) 2012-2015 Ben Kurtovic
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
diff --git a/tests/test_text.py b/tests/test_text.py
index ee2e5c7..9093824 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2014 Ben Kurtovic
+# Copyright (C) 2012-2015 Ben Kurtovic
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
diff --git a/tests/test_tokens.py b/tests/test_tokens.py
index 3efce86..98f9a56 100644
--- a/tests/test_tokens.py
+++ b/tests/test_tokens.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2014 Ben Kurtovic
+# Copyright (C) 2012-2015 Ben Kurtovic
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
diff --git a/tests/test_utils.py b/tests/test_utils.py
index ddcc078..a9d4119 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2014 Ben Kurtovic
+# Copyright (C) 2012-2015 Ben Kurtovic
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py
index 7a30a75..d97830c 100644
--- a/tests/test_wikicode.py
+++ b/tests/test_wikicode.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2014 Ben Kurtovic
+# Copyright (C) 2012-2015 Ben Kurtovic
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
diff --git a/tests/test_wikilink.py b/tests/test_wikilink.py
index 1bdc907..e95cd84 100644
--- a/tests/test_wikilink.py
+++ b/tests/test_wikilink.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2014 Ben Kurtovic
+# Copyright (C) 2012-2015 Ben Kurtovic
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

From ef18166c1240002def94295b4ea0bc046e186086 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Mon, 12 Jan 2015 18:26:13 -0500
Subject: [PATCH 100/102] Update changelog following #94.

---
 .travis.yml        | 1 -
 CHANGELOG          | 5 +++--
 docs/changelog.rst | 5 +++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index daa31ac..07dab97 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,6 @@ script:
   - coverage run --source=mwparserfromhell setup.py -q test
 after_success:
   - coveralls
-
 env:
   matrix:
     - WITHOUT_EXTENSION=0
diff --git a/CHANGELOG b/CHANGELOG
index 1e9801b..584ade4 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,7 +1,8 @@
 v0.4 (unreleased):
 
-- The parser is now distributed with Windows binaries, fixing an issue that
-  prevented Windows users from using the C tokenizer.
+- The parser is now distributed with Windows binaries, and falls back on a pure
+  Python mode if C extensions cannot be built. This fixes an issue that
+  prevented some Windows users from installing the parser.
 - Added support for parsing wikicode tables (patches by David Winegar).
 - Added a script to test for memory leaks in scripts/memtest.py.
 - Added a script to do releases in scripts/release.sh.
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 2285a82..16963b0 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -7,8 +7,9 @@ v0.4
 Unreleased
 (`changes `__):
 
-- The parser is now distributed with Windows binaries, fixing an issue that
-  prevented Windows users from using the C tokenizer.
+- The parser is now distributed with Windows binaries, and falls back on a pure
+  Python mode if C extensions cannot be built. This fixes an issue that
+  prevented some Windows users from installing the parser.
 - Added support for parsing wikicode tables (patches by David Winegar).
 - Added a script to test for memory leaks in :file:`scripts/memtest.py`.
 - Added a script to do releases in :file:`scripts/release.sh`.

From 432da1260f3ec2ae876b4de8cc02052dd43d1f7e Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 23 May 2015 23:29:15 -0400
Subject: [PATCH 101/102] Changelog update for 0.4.

---
 CHANGELOG          | 6 +++---
 docs/changelog.rst | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 584ade4..0ab103a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,8 +1,8 @@
 v0.4 (unreleased):
 
-- The parser is now distributed with Windows binaries, and falls back on a pure
-  Python mode if C extensions cannot be built. This fixes an issue that
-  prevented some Windows users from installing the parser.
+- The parser now falls back on pure Python mode if C extensions cannot be
+  built. This fixes an issue that prevented some Windows users from installing
+  the parser.
 - Added support for parsing wikicode tables (patches by David Winegar).
 - Added a script to test for memory leaks in scripts/memtest.py.
 - Added a script to do releases in scripts/release.sh.
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 16963b0..9811b5c 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -7,9 +7,9 @@ v0.4
 Unreleased
 (`changes `__):
 
-- The parser is now distributed with Windows binaries, and falls back on a pure
-  Python mode if C extensions cannot be built. This fixes an issue that
-  prevented some Windows users from installing the parser.
+- The parser now falls back on pure Python mode if C extensions cannot be
+  built. This fixes an issue that prevented some Windows users from installing
+  the parser.
 - Added support for parsing wikicode tables (patches by David Winegar).
 - Added a script to test for memory leaks in :file:`scripts/memtest.py`.
 - Added a script to do releases in :file:`scripts/release.sh`.

From 8f5f9b402db36aeb157318137972bd2196a4c19e Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 23 May 2015 23:35:42 -0400
Subject: [PATCH 102/102] release/0.4

---
 mwparserfromhell/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py
index 1c50753..94b6e03 100644
--- a/mwparserfromhell/__init__.py
+++ b/mwparserfromhell/__init__.py
@@ -29,7 +29,7 @@ outrageously powerful parser for `MediaWiki `_ wikicode.
 __author__ = "Ben Kurtovic"
 __copyright__ = "Copyright (C) 2012, 2013, 2014, 2015 Ben Kurtovic"
 __license__ = "MIT License"
-__version__ = "0.4.dev0"
+__version__ = "0.4"
 __email__ = "ben.kurtovic@gmail.com"
 
 from . import (compat, definitions, nodes, parser, smart_list, string_mixin,
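
The "pure Python mode" fallback noted in the v0.4 changelog entries above can be
confirmed at runtime. A minimal sketch, assuming the mwparserfromhell.parser.use_c
flag described in the project README, which is True only when the C tokenizer
extension was built and imported:

    import mwparserfromhell
    from mwparserfromhell import parser

    # use_c is True when the C tokenizer extension loaded; False means the
    # pure-Python fallback described in the v0.4 changelog is in effect.
    print("C tokenizer in use:", parser.use_c)

    # Parsing works the same way in either mode.
    code = mwparserfromhell.parse("{{foo|bar}}")
    print(code.filter_templates())  # expected: ['{{foo|bar}}']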