
Merge develop into master (release/0.6.3)

Ben Kurtovic, 3 years ago
commit 985aff5786
74 changed files with 4480 additions and 2901 deletions
1. +13 -0 .clang-format
2. +22 -0 .github/workflows/build-linux-wheels.yml
3. +2 -0 .gitignore
4. +9 -0 .pre-commit-config.yaml
5. +6 -0 CHANGELOG
6. +1 -1 appveyor.yml
7. +12 -0 docs/changelog.rst
8. +80 -66 docs/conf.py
9. +11 -6 scripts/memtest.py
10. +32 -23 setup.py
11. +2 -3 src/mwparserfromhell/__init__.py
12. +16 -4 src/mwparserfromhell/definitions.py
13. +12 -2 src/mwparserfromhell/nodes/__init__.py
14. +2 -0 src/mwparserfromhell/nodes/_base.py
15. +1 -0 src/mwparserfromhell/nodes/argument.py
16. +1 -0 src/mwparserfromhell/nodes/comment.py
17. +2 -0 src/mwparserfromhell/nodes/external_link.py
18. +10 -2 src/mwparserfromhell/nodes/extras/attribute.py
19. +2 -2 src/mwparserfromhell/nodes/extras/parameter.py
20. +1 -0 src/mwparserfromhell/nodes/heading.py
21. +12 -5 src/mwparserfromhell/nodes/html_entity.py
22. +32 -8 src/mwparserfromhell/nodes/tag.py
23. +7 -6 src/mwparserfromhell/nodes/template.py
24. +1 -0 src/mwparserfromhell/nodes/text.py
25. +1 -0 src/mwparserfromhell/nodes/wikilink.py
26. +5 -0 src/mwparserfromhell/parser/__init__.py
27. +57 -20 src/mwparserfromhell/parser/builder.py
28. +59 -34 src/mwparserfromhell/parser/contexts.py
29. +525 -510 src/mwparserfromhell/parser/ctokenizer/avl_tree.c
30. +166 -177 src/mwparserfromhell/parser/ctokenizer/avl_tree.h
31. +34 -33 src/mwparserfromhell/parser/ctokenizer/common.h
32. +11 -5 src/mwparserfromhell/parser/ctokenizer/contexts.h
33. +31 -23 src/mwparserfromhell/parser/ctokenizer/definitions.c
34. +5 -6 src/mwparserfromhell/parser/ctokenizer/definitions.h
35. +20 -14 src/mwparserfromhell/parser/ctokenizer/tag_data.c
36. +6 -6 src/mwparserfromhell/parser/ctokenizer/tag_data.h
37. +45 -26 src/mwparserfromhell/parser/ctokenizer/textbuffer.c
38. +8 -8 src/mwparserfromhell/parser/ctokenizer/textbuffer.h
39. +984 -771 src/mwparserfromhell/parser/ctokenizer/tok_parse.c
40. +4 -3 src/mwparserfromhell/parser/ctokenizer/tok_parse.h
41. +122 -79 src/mwparserfromhell/parser/ctokenizer/tok_support.c
42. +28 -31 src/mwparserfromhell/parser/ctokenizer/tok_support.h
43. +96 -69 src/mwparserfromhell/parser/ctokenizer/tokenizer.c
44. +56 -47 src/mwparserfromhell/parser/ctokenizer/tokenizer.h
45. +42 -44 src/mwparserfromhell/parser/ctokenizer/tokens.c
46. +38 -38 src/mwparserfromhell/parser/ctokenizer/tokens.h
47. +2 -0 src/mwparserfromhell/parser/errors.py
48. +166 -99 src/mwparserfromhell/parser/tokenizer.py
49. +30 -28 src/mwparserfromhell/parser/tokens.py
50. +4 -4 src/mwparserfromhell/smart_list/list_proxy.py
51. +5 -2 src/mwparserfromhell/string_mixin.py
52. +5 -2 src/mwparserfromhell/utils.py
53. +57 -22 src/mwparserfromhell/wikicode.py
54. +22 -2 tests/conftest.py
55. +16 -2 tests/test_argument.py
56. +5 -0 tests/test_attribute.py
57. +737 -326 tests/test_builder.py
58. +5 -0 tests/test_comment.py
59. +12 -6 tests/test_docs.py
60. +12 -7 tests/test_external_link.py
61. +7 -2 tests/test_heading.py
62. +9 -0 tests/test_html_entity.py
63. +4 -0 tests/test_parameter.py
64. +43 -19 tests/test_parser.py
65. +27 -2 tests/test_smart_list.py
66. +95 -20 tests/test_string_mixin.py
67. +99 -40 tests/test_tag.py
68. +267 -140 tests/test_template.py
69. +5 -0 tests/test_text.py
70. +26 -14 tests/test_tokenizer.py
71. +9 -5 tests/test_tokens.py
72. +23 -18 tests/test_utils.py
73. +140 -67 tests/test_wikicode.py
74. +16 -2 tests/test_wikilink.py

+13 -0 .clang-format

@@ -0,0 +1,13 @@
BasedOnStyle: LLVM
AlignConsecutiveMacros: AcrossEmptyLines
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakAfterReturnType: TopLevelDefinitions
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Linux
ColumnLimit: 88
IndentPPDirectives: AfterHash
IndentWidth: 4
SpaceAfterCStyleCast: true
StatementMacros:
- PyObject_HEAD
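
A quick illustration of the shape these options request: Linux-style braces, the return type of top-level definitions on its own line, 4-space indents, and a space after C-style casts. This is a hedged sketch with made-up names, not code from the repository:

    static int
    example_lookup(void *ptr)
    {
        if (ptr != NULL) {
            return (int) 1; /* SpaceAfterCStyleCast: true */
        }
        return 0;
    }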

+22 -0 .github/workflows/build-linux-wheels.yml

@@ -23,3 +23,25 @@ jobs:
with:
user: __token__
password: ${{ secrets.pypi_password }}
build_aarch64:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: docker/setup-qemu-action@v1
name: Set up QEMU
- name: Build manylinux aarch64 Python wheels
uses: RalfG/python-wheels-manylinux-build@v0.3.4-manylinux2014_aarch64
with:
python-versions: 'cp36-cp36m cp37-cp37m cp38-cp38 cp39-cp39'
pip-wheel-args: '-w ./wheelhouse --no-deps'
- name: Move to dist/
run: |
mkdir -p dist
cp -v wheelhouse/*-manylinux*.whl dist/
- name: Publish package to PyPI
# Only actually publish if a new tag was pushed
if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@37e305e7413032d8422456179fee28fac7d25187
with:
user: __token__
password: ${{ secrets.pypi_password }}

+2 -0 .gitignore

@@ -13,4 +13,6 @@ dist
docs/_build
scripts/*.log
htmlcov/
compile_commands.json
.idea/
.pytest_cache/

+9 -0 .pre-commit-config.yaml

@@ -0,0 +1,9 @@
repos:
- repo: https://github.com/psf/black
rev: 21.8b0
hooks:
- id: black
- repo: https://github.com/doublify/pre-commit-clang-format
rev: 62302476d0da01515660132d76902359bed0f782
hooks:
- id: clang-format

+6 -0 CHANGELOG

@@ -1,3 +1,9 @@
v0.6.3 (released September 2, 2021):

- Added Linux AArch64 wheels. (#276)
- Fixed C integer conversion, manifesting as parsing errors on big-endian
platforms. (#277)

v0.6.2 (released May 16, 2021):

- Improved parsing of external links. (#232)
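
The #277 entry above (C integer conversion manifesting as parsing errors on big-endian platforms) is a good reminder of how such bugs hide: code that reinterprets a wide integer through a narrower type reads the low-order bytes on little-endian machines but the high-order bytes on big-endian ones. A hedged sketch of the bug class only, not the actual patch:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t value = 5;
        /* Little-endian: reads the low 4 bytes -> 5.
         * Big-endian: reads the high 4 bytes -> 0.
         * The mismatch is silent until run on a big-endian platform. */
        int narrowed = *(int *) &value;
        printf("%d\n", narrowed);
        return 0;
    }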


+1 -1 appveyor.yml

@@ -1,6 +1,6 @@
# This config file is used by appveyor.com to build Windows release binaries

version: 0.6.2-b{build}
version: 0.6.3-b{build}

branches:
only:


+12 -0 docs/changelog.rst

@@ -1,6 +1,18 @@
Changelog
=========

v0.6.3
------

`Released September 2, 2021 <https://github.com/earwig/mwparserfromhell/tree/v0.6.3>`_
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.6.2...v0.6.3>`__):

- Added Linux AArch64 wheels.
(`#276 <https://github.com/earwig/mwparserfromhell/issues/276>`_)
- Fixed C integer conversion, manifesting as parsing errors on big-endian
platforms.
(`#277 <https://github.com/earwig/mwparserfromhell/issues/277>`_)

v0.6.2
------



+80 -66 docs/conf.py

@@ -16,33 +16,33 @@ import sys, os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath(".."))
import mwparserfromhell

# -- General configuration -----------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode']
extensions = ["sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.viewcode"]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
templates_path = ["_templates"]

# The suffix of source filenames.
source_suffix = '.rst'
source_suffix = ".rst"

# The encoding of source files.
#source_encoding = 'utf-8-sig'
# source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'
master_doc = "index"

# General information about the project.
project = u'mwparserfromhell'
copyright = u'2012–2021 Ben Kurtovic'
project = "mwparserfromhell"
copyright = "2012–2021 Ben Kurtovic"

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -55,158 +55,161 @@ release = mwparserfromhell.__version__

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# language = None

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
exclude_patterns = ["_build"]

# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
pygments_style = "sphinx"

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# modindex_common_prefix = []


# -- Options for HTML output ---------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'nature'
html_theme = "nature"

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# html_theme_path = []

# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# html_title = None

# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# html_short_title = None

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# html_logo = None

# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# html_favicon = None

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_static_path = ["_static"]

# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# html_last_updated_fmt = '%b %d, %Y'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# html_sidebars = {}

# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True
# html_domain_indices = True

# If false, no index is generated.
#html_use_index = True
# html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False
# html_split_index = False

# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = 'mwparserfromhelldoc'
htmlhelp_basename = "mwparserfromhelldoc"


# -- Options for LaTeX output --------------------------------------------------

latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',

# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',

# Additional stuff for the LaTeX preamble.
#'preamble': '',
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'mwparserfromhell.tex', u'mwparserfromhell Documentation',
u'Ben Kurtovic', 'manual'),
(
"index",
"mwparserfromhell.tex",
"mwparserfromhell Documentation",
"Ben Kurtovic",
"manual",
)
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# latex_logo = None

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# latex_use_parts = False

# If true, show page references after internal links.
#latex_show_pagerefs = False
# latex_show_pagerefs = False

# If true, show URL addresses after external links.
#latex_show_urls = False
# latex_show_urls = False

# Documents to append as an appendix to all manuals.
#latex_appendices = []
# latex_appendices = []

# If false, no module index is generated.
#latex_domain_indices = True
# latex_domain_indices = True


# -- Options for manual page output --------------------------------------------
@@ -214,12 +217,17 @@ latex_documents = [
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'mwparserfromhell', u'mwparserfromhell Documentation',
[u'Ben Kurtovic'], 1)
(
"index",
"mwparserfromhell",
"mwparserfromhell Documentation",
["Ben Kurtovic"],
1,
)
]

# If true, show URL addresses after external links.
#man_show_urls = False
# man_show_urls = False


# -- Options for Texinfo output ------------------------------------------------
@@ -228,20 +236,26 @@ man_pages = [
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'mwparserfromhell', u'mwparserfromhell Documentation',
u'Ben Kurtovic', 'mwparserfromhell', 'One line description of project.',
'Miscellaneous'),
(
"index",
"mwparserfromhell",
"mwparserfromhell Documentation",
"Ben Kurtovic",
"mwparserfromhell",
"One line description of project.",
"Miscellaneous",
)
]

# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# texinfo_appendices = []

# If false, no module index is generated.
#texinfo_domain_indices = True
# texinfo_domain_indices = True

# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# texinfo_show_urls = 'footnote'


# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {'http://docs.python.org/': None}
intersphinx_mapping = {"http://docs.python.org/": None}

+11 -6 scripts/memtest.py

@@ -41,6 +41,7 @@ from mwparserfromhell.parser._tokenizer import CTokenizer

LOOPS = 10000


class Color:
GRAY = "\x1b[30;1m"
GREEN = "\x1b[92m"
@@ -63,11 +64,11 @@ class MemoryTest:
data = {"name": None, "label": None, "input": None, "output": None}
for line in test.strip().splitlines():
if line.startswith("name:"):
data["name"] = line[len("name:"):].strip()
data["name"] = line[len("name:") :].strip()
elif line.startswith("label:"):
data["label"] = line[len("label:"):].strip()
data["label"] = line[len("label:") :].strip()
elif line.startswith("input:"):
raw = line[len("input:"):].strip()
raw = line[len("input:") :].strip()
if raw[0] == '"' and raw[-1] == '"':
raw = raw[1:-1]
raw = raw.encode("raw_unicode_escape")
@@ -81,7 +82,7 @@ class MemoryTest:
def load_file(filename):
with open(filename, "rU") as fp:
text = fp.read()
name = path.split(filename)[1][:0-len(extension)]
name = path.split(filename)[1][: 0 - len(extension)]
self._parse_file(name, text)

root = path.split(path.dirname(path.abspath(__file__)))[0]
@@ -119,8 +120,11 @@ class MemoryTest:

tmpl = "{0}[{1:03}/{2}]{3} {4}: "
for i, (name, text) in enumerate(self._tests, 1):
sys.stdout.write(tmpl.format(Color.GRAY, i, len(self._tests),
Color.RESET, name.ljust(width)))
sys.stdout.write(
tmpl.format(
Color.GRAY, i, len(self._tests), Color.RESET, name.ljust(width)
)
)
sys.stdout.flush()
parent, child = Pipe()
p = Process(target=_runner, args=(text, child))
@@ -156,6 +160,7 @@ def _runner(text, child):
child.send("OK")
child.recv()


if __name__ == "__main__":
setlocale(LC_ALL, "")
MemoryTest().run()

+32 -23 setup.py

@@ -52,8 +52,10 @@ elif env_var is not None:

# Remove the command line argument as it isn't understood by setuptools:

sys.argv = [arg for arg in sys.argv
if arg not in ("--without-extension", "--with-extension")]
sys.argv = [
arg for arg in sys.argv if arg not in ("--without-extension", "--with-extension")
]


def build_ext_patched(self):
try:
@@ -63,33 +65,40 @@ def build_ext_patched(self):
print("Falling back to pure Python mode.")
del self.extensions[:]


if fallback:
build_ext.run, build_ext_original = build_ext_patched, build_ext.run

# Project-specific part begins here:

tokenizer = Extension("mwparserfromhell.parser._tokenizer",
sources=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.c")),
depends=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.h")))
tokenizer = Extension(
"mwparserfromhell.parser._tokenizer",
sources=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.c")),
depends=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.h")),
)

setup(
name = "mwparserfromhell",
packages = find_packages("src"),
package_dir = {"": "src"},
ext_modules = [tokenizer] if use_extension else [],
setup_requires = ["pytest-runner"] if "test" in sys.argv or "pytest" in sys.argv else [],
tests_require = ["pytest"],
version = __version__,
python_requires = ">= 3.5",
author = "Ben Kurtovic",
author_email = "ben.kurtovic@gmail.com",
url = "https://github.com/earwig/mwparserfromhell",
description = "MWParserFromHell is a parser for MediaWiki wikicode.",
long_description = long_docs,
download_url = "https://github.com/earwig/mwparserfromhell/tarball/v{}".format(__version__),
keywords = "earwig mwparserfromhell wikipedia wiki mediawiki wikicode template parsing",
license = "MIT License",
classifiers = [
name="mwparserfromhell",
packages=find_packages("src"),
package_dir={"": "src"},
ext_modules=[tokenizer] if use_extension else [],
setup_requires=["pytest-runner"]
if "test" in sys.argv or "pytest" in sys.argv
else [],
tests_require=["pytest"],
version=__version__,
python_requires=">= 3.5",
author="Ben Kurtovic",
author_email="ben.kurtovic@gmail.com",
url="https://github.com/earwig/mwparserfromhell",
description="MWParserFromHell is a parser for MediaWiki wikicode.",
long_description=long_docs,
download_url="https://github.com/earwig/mwparserfromhell/tarball/v{}".format(
__version__
),
keywords="earwig mwparserfromhell wikipedia wiki mediawiki wikicode template parsing",
license="MIT License",
classifiers=[
"Development Status :: 4 - Beta",
"Environment :: Console",
"Intended Audience :: Developers",
@@ -101,6 +110,6 @@ setup(
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Topic :: Text Processing :: Markup"
"Topic :: Text Processing :: Markup",
],
)

+2 -3 src/mwparserfromhell/__init__.py

@@ -27,10 +27,9 @@ outrageously powerful parser for `MediaWiki <https://www.mediawiki.org>`_ wikico
__author__ = "Ben Kurtovic"
__copyright__ = "Copyright (C) 2012-2021 Ben Kurtovic"
__license__ = "MIT License"
__version__ = "0.6.2"
__version__ = "0.6.3"
__email__ = "ben.kurtovic@gmail.com"

from . import (definitions, nodes, parser, smart_list, string_mixin,
utils, wikicode)
from . import definitions, nodes, parser, smart_list, string_mixin, utils, wikicode

parse = utils.parse_anything

+16 -4 src/mwparserfromhell/definitions.py

@@ -26,8 +26,14 @@ When updating this file, please also update the the C tokenizer version:
- mwparserfromhell/parser/ctokenizer/definitions.h
"""

__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
"is_single_only", "is_scheme"]
__all__ = [
"get_html_tag",
"is_parsable",
"is_visible",
"is_single",
"is_single_only",
"is_scheme",
]

URI_SCHEMES = {
# [wikimedia/mediawiki.git]/includes/DefaultSettings.php @ 5c660de5d0
@@ -92,7 +98,7 @@ INVISIBLE_TAGS = [
"score",
"section",
"templatedata",
"timeline"
"timeline",
]

# [wikimedia/mediawiki.git]/includes/parser/Sanitizer.php @ 95e17ee645
@@ -103,29 +109,35 @@ MARKUP_TO_HTML = {
"#": "li",
"*": "li",
";": "dt",
":": "dd"
":": "dd",
}


def get_html_tag(markup):
"""Return the HTML tag associated with the given wiki-markup."""
return MARKUP_TO_HTML[markup]


def is_parsable(tag):
"""Return if the given *tag*'s contents should be passed to the parser."""
return tag.lower() not in PARSER_BLACKLIST


def is_visible(tag):
"""Return whether or not the given *tag* contains visible text."""
return tag.lower() not in INVISIBLE_TAGS


def is_single(tag):
"""Return whether or not the given *tag* can exist without a close tag."""
return tag.lower() in SINGLE


def is_single_only(tag):
"""Return whether or not the given *tag* must exist without a close tag."""
return tag.lower() in SINGLE_ONLY


def is_scheme(scheme, slashes=True):
"""Return whether *scheme* is valid for external links."""
scheme = scheme.lower()


+12 -2 src/mwparserfromhell/nodes/__init__.py

@@ -39,5 +39,15 @@ from .tag import Tag
from .template import Template
from .wikilink import Wikilink

__all__ = ["Argument", "Comment", "ExternalLink", "HTMLEntity", "Heading",
"Node", "Tag", "Template", "Text", "Wikilink"]
__all__ = [
"Argument",
"Comment",
"ExternalLink",
"HTMLEntity",
"Heading",
"Node",
"Tag",
"Template",
"Text",
"Wikilink",
]

+2 -0 src/mwparserfromhell/nodes/_base.py

@@ -22,6 +22,7 @@ from ..string_mixin import StringMixIn

__all__ = ["Node"]


class Node(StringMixIn):
"""Represents the base Node type, demonstrating the methods to override.

@@ -35,6 +36,7 @@ class Node(StringMixIn):
:meth:`__showtree__` can be overridden to build a nice tree representation
of the node, if desired, for :meth:`~.Wikicode.get_tree`.
"""

def __str__(self):
raise NotImplementedError()



+1 -0 src/mwparserfromhell/nodes/argument.py

@@ -24,6 +24,7 @@ from ..utils import parse_anything

__all__ = ["Argument"]


class Argument(Node):
"""Represents a template argument substitution, like ``{{{foo}}}``."""



+1 -0 src/mwparserfromhell/nodes/comment.py

@@ -23,6 +23,7 @@ from ._base import Node

__all__ = ["Comment"]


class Comment(Node):
"""Represents a hidden HTML comment, like ``<!-- foobar -->``."""



+2 -0 src/mwparserfromhell/nodes/external_link.py

@@ -24,6 +24,7 @@ from ..utils import parse_anything

__all__ = ["ExternalLink"]


class ExternalLink(Node):
"""Represents an external link, like ``[http://example.com/ Example]``."""

@@ -83,6 +84,7 @@ class ExternalLink(Node):
def url(self, value):
# pylint: disable=import-outside-toplevel
from ..parser import contexts

self._url = parse_anything(value, contexts.EXT_LINK_URI)

@title.setter


+10 -2 src/mwparserfromhell/nodes/extras/attribute.py

@@ -24,6 +24,7 @@ from ...utils import parse_anything

__all__ = ["Attribute"]


class Attribute(StringMixIn):
"""Represents an attribute of an HTML tag.

@@ -32,8 +33,15 @@ class Attribute(StringMixIn):
whose value is ``"foo"``.
"""

def __init__(self, name, value=None, quotes='"', pad_first=" ",
pad_before_eq="", pad_after_eq=""):
def __init__(
self,
name,
value=None,
quotes='"',
pad_first=" ",
pad_before_eq="",
pad_after_eq="",
):
super().__init__()
self.name = name
self._quotes = None


+2 -2 src/mwparserfromhell/nodes/extras/parameter.py

@@ -25,6 +25,7 @@ from ...utils import parse_anything

__all__ = ["Parameter"]


class Parameter(StringMixIn):
"""Represents a paramater of a template.

@@ -77,6 +78,5 @@ class Parameter(StringMixIn):
def showkey(self, newval):
newval = bool(newval)
if not newval and not self.can_hide_key(self.name):
raise ValueError("parameter key {!r} cannot be hidden".format(
self.name))
raise ValueError("parameter key {!r} cannot be hidden".format(self.name))
self._showkey = newval

+1 -0 src/mwparserfromhell/nodes/heading.py

@@ -24,6 +24,7 @@ from ..utils import parse_anything

__all__ = ["Heading"]


class Heading(Node):
"""Represents a section heading in wikicode, like ``== Foo ==``."""



+12 -5 src/mwparserfromhell/nodes/html_entity.py

@@ -24,6 +24,7 @@ from ._base import Node

__all__ = ["HTMLEntity"]


class HTMLEntity(Node):
"""Represents an HTML entity, like ``&nbsp;``, either named or unnamed."""

@@ -101,19 +102,23 @@ class HTMLEntity(Node):
except ValueError:
if newval not in htmlentities.entitydefs:
raise ValueError(
"entity value {!r} is not a valid name".format(newval)) from None
"entity value {!r} is not a valid name".format(newval)
) from None
self._named = True
self._hexadecimal = False
else:
if intval < 0 or intval > 0x10FFFF:
raise ValueError(
"entity value 0x{:x} is not in range(0x110000)".format(intval)) from None
"entity value 0x{:x} is not in range(0x110000)".format(intval)
) from None
self._named = False
self._hexadecimal = True
else:
test = int(newval, 16 if self.hexadecimal else 10)
if test < 0 or test > 0x10FFFF:
raise ValueError("entity value {} is not in range(0x110000)".format(test))
raise ValueError(
"entity value {} is not in range(0x110000)".format(test)
)
self._named = False
self._value = newval

@@ -126,8 +131,10 @@ class HTMLEntity(Node):
try:
int(self.value, 16)
except ValueError as exc:
raise ValueError("current entity value {!r} is not a valid "
"Unicode codepoint".format(self.value)) from exc
raise ValueError(
"current entity value {!r} is not a valid "
"Unicode codepoint".format(self.value)
) from exc
self._named = newval

@hexadecimal.setter


+32 -8 src/mwparserfromhell/nodes/tag.py

@@ -26,13 +26,24 @@ from ..utils import parse_anything

__all__ = ["Tag"]


class Tag(Node):
"""Represents an HTML-style tag in wikicode, like ``<ref>``."""

def __init__(self, tag, contents=None, attrs=None, wiki_markup=None,
self_closing=False, invalid=False, implicit=False, padding="",
closing_tag=None, wiki_style_separator=None,
closing_wiki_markup=None):
def __init__(
self,
tag,
contents=None,
attrs=None,
wiki_markup=None,
self_closing=False,
invalid=False,
implicit=False,
padding="",
closing_tag=None,
wiki_style_separator=None,
closing_wiki_markup=None,
):
super().__init__()
self.tag = tag
self.contents = contents
@@ -60,8 +71,14 @@ class Tag(Node):
if self.self_closing:
return self.wiki_markup + attrs + padding + separator
close = self.closing_wiki_markup or ""
return self.wiki_markup + attrs + padding + separator + \
str(self.contents) + close
return (
self.wiki_markup
+ attrs
+ padding
+ separator
+ str(self.contents)
+ close
)

result = ("</" if self.invalid else "<") + str(self.tag)
if self.attributes:
@@ -270,8 +287,15 @@ class Tag(Node):
return attr
raise ValueError(name)

def add(self, name, value=None, quotes='"', pad_first=" ",
pad_before_eq="", pad_after_eq=""):
def add(
self,
name,
value=None,
quotes='"',
pad_first=" ",
pad_before_eq="",
pad_after_eq="",
):
"""Add an attribute with the given *name* and *value*.

*name* and *value* can be anything parsable by


+7 -6 src/mwparserfromhell/nodes/template.py

@@ -33,6 +33,7 @@ FLAGS = re.DOTALL | re.UNICODE
# Used to allow None as a valid fallback value
_UNSET = object()


class Template(Node):
"""Represents a template in wikicode, like ``{{foo}}``."""

@@ -153,7 +154,7 @@ class Template(Node):
def _fix_dependendent_params(self, i):
"""Unhide keys if necessary after removing the param at index *i*."""
if not self.params[i].showkey:
for param in self.params[i + 1:]:
for param in self.params[i + 1 :]:
if not param.showkey:
param.showkey = True

@@ -175,9 +176,10 @@ class Template(Node):
If one exists, we should remove the given one rather than blanking it.
"""
if self.params[i].showkey:
following = self.params[i + 1:]
better_matches = [after.name.strip() == name and not after.showkey
for after in following]
following = self.params[i + 1 :]
better_matches = [
after.name.strip() == name and not after.showkey for after in following
]
return any(better_matches)
return False

@@ -235,8 +237,7 @@ class Template(Node):
def __getitem__(self, name):
return self.get(name)

def add(self, name, value, showkey=None, before=None,
preserve_spacing=True):
def add(self, name, value, showkey=None, before=None, preserve_spacing=True):
"""Add a parameter to the template with a given *name* and *value*.

*name* and *value* can be anything parsable by


+1 -0 src/mwparserfromhell/nodes/text.py

@@ -23,6 +23,7 @@ from ._base import Node

__all__ = ["Text"]


class Text(Node):
"""Represents ordinary, unformatted text with no special properties."""



+1 -0 src/mwparserfromhell/nodes/wikilink.py

@@ -24,6 +24,7 @@ from ..utils import parse_anything

__all__ = ["Wikilink"]


class Wikilink(Node):
"""Represents an internal wikilink, like ``[[Foo|Bar]]``."""



+5 -0 src/mwparserfromhell/parser/__init__.py

@@ -26,16 +26,20 @@ together into one interface.

from .builder import Builder
from .errors import ParserError

try:
from ._tokenizer import CTokenizer

use_c = True
except ImportError:
from .tokenizer import Tokenizer

CTokenizer = None
use_c = False

__all__ = ["use_c", "Parser", "ParserError"]


class Parser:
"""Represents a parser for wikicode.

@@ -57,6 +61,7 @@ class Parser:
self._tokenizer = CTokenizer()
else:
from .tokenizer import Tokenizer

self._tokenizer = Tokenizer()
self._builder = Builder()



+57 -20 src/mwparserfromhell/parser/builder.py

@@ -21,24 +21,34 @@

from . import tokens
from .errors import ParserError
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
Template, Text, Wikilink)
from ..nodes import (
Argument,
Comment,
ExternalLink,
Heading,
HTMLEntity,
Tag,
Template,
Text,
Wikilink,
)
from ..nodes.extras import Attribute, Parameter
from ..smart_list import SmartList
from ..wikicode import Wikicode

__all__ = ["Builder"]

_HANDLERS = {
tokens.Text: lambda self, token: Text(token.text)
}
_HANDLERS = {tokens.Text: lambda self, token: Text(token.text)}


def _add_handler(token_type):
"""Create a decorator that adds a handler function to the lookup table."""

def decorator(func):
"""Add a handler function to the lookup table."""
_HANDLERS[token_type] = func
return func

return decorator


@@ -84,8 +94,9 @@ class Builder:
key = self._pop()
showkey = True
self._push()
elif isinstance(token, (tokens.TemplateParamSeparator,
tokens.TemplateClose)):
elif isinstance(
token, (tokens.TemplateParamSeparator, tokens.TemplateClose)
):
self._tokens.append(token)
value = self._pop()
if key is None:
@@ -167,10 +178,17 @@ class Builder:
self._push()
elif isinstance(token, tokens.ExternalLinkClose):
if url is not None:
return ExternalLink(url, self._pop(), brackets=brackets,
suppress_space=suppress_space is True)
return ExternalLink(self._pop(), brackets=brackets,
suppress_space=suppress_space is True)
return ExternalLink(
url,
self._pop(),
brackets=brackets,
suppress_space=suppress_space is True,
)
return ExternalLink(
self._pop(),
brackets=brackets,
suppress_space=suppress_space is True,
)
else:
self._write(self._handle_token(token))
raise ParserError("_handle_external_link() missed a close token")
@@ -184,8 +202,9 @@ class Builder:
if isinstance(token, tokens.HTMLEntityHex):
text = self._tokens.pop()
self._tokens.pop() # Remove HTMLEntityEnd
return HTMLEntity(text.text, named=False, hexadecimal=True,
hex_char=token.char)
return HTMLEntity(
text.text, named=False, hexadecimal=True, hex_char=token.char
)
self._tokens.pop() # Remove HTMLEntityEnd
return HTMLEntity(token.text, named=False, hexadecimal=False)
self._tokens.pop() # Remove HTMLEntityEnd
@@ -227,15 +246,23 @@ class Builder:
self._push()
elif isinstance(token, tokens.TagAttrQuote):
quotes = token.char
elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen,
tokens.TagCloseSelfclose)):
elif isinstance(
token,
(tokens.TagAttrStart, tokens.TagCloseOpen, tokens.TagCloseSelfclose),
):
self._tokens.append(token)
if name:
value = self._pop()
else:
name, value = self._pop(), None
return Attribute(name, value, quotes, start.pad_first,
start.pad_before_eq, start.pad_after_eq)
return Attribute(
name,
value,
quotes,
start.pad_first,
start.pad_before_eq,
start.pad_after_eq,
)
else:
self._write(self._handle_token(token))
raise ParserError("_handle_attribute() missed a close token")
@@ -271,9 +298,19 @@ class Builder:
else:
self_closing = False
closing_tag = self._pop()
return Tag(tag, contents, attrs, wiki_markup, self_closing,
invalid, implicit, padding, closing_tag,
wiki_style_separator, closing_wiki_markup)
return Tag(
tag,
contents,
attrs,
wiki_markup,
self_closing,
invalid,
implicit,
padding,
closing_tag,
wiki_style_separator,
closing_wiki_markup,
)
else:
self._write(self._handle_token(token))
raise ParserError("_handle_tag() missed a close token")


+59 -34 src/mwparserfromhell/parser/contexts.py

@@ -116,21 +116,21 @@ Aggregate contexts:

# Local contexts:

TEMPLATE_NAME = 1 << 0
TEMPLATE_PARAM_KEY = 1 << 1
TEMPLATE_NAME = 1 << 0
TEMPLATE_PARAM_KEY = 1 << 1
TEMPLATE_PARAM_VALUE = 1 << 2
TEMPLATE = TEMPLATE_NAME + TEMPLATE_PARAM_KEY + TEMPLATE_PARAM_VALUE

ARGUMENT_NAME = 1 << 3
ARGUMENT_NAME = 1 << 3
ARGUMENT_DEFAULT = 1 << 4
ARGUMENT = ARGUMENT_NAME + ARGUMENT_DEFAULT

WIKILINK_TITLE = 1 << 5
WIKILINK_TEXT = 1 << 6
WIKILINK_TEXT = 1 << 6
WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT

EXT_LINK_URI = 1 << 7
EXT_LINK_TITLE = 1 << 8
EXT_LINK_URI = 1 << 7
EXT_LINK_TITLE = 1 << 8
EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE

HEADING_LEVEL_1 = 1 << 9
@@ -139,42 +139,61 @@ HEADING_LEVEL_3 = 1 << 11
HEADING_LEVEL_4 = 1 << 12
HEADING_LEVEL_5 = 1 << 13
HEADING_LEVEL_6 = 1 << 14
HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 +
HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6)

TAG_OPEN = 1 << 15
TAG_ATTR = 1 << 16
TAG_BODY = 1 << 17
HEADING = (
HEADING_LEVEL_1
+ HEADING_LEVEL_2
+ HEADING_LEVEL_3
+ HEADING_LEVEL_4
+ HEADING_LEVEL_5
+ HEADING_LEVEL_6
)

TAG_OPEN = 1 << 15
TAG_ATTR = 1 << 16
TAG_BODY = 1 << 17
TAG_CLOSE = 1 << 18
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE

STYLE_ITALICS = 1 << 19
STYLE_BOLD = 1 << 20
STYLE_PASS_AGAIN = 1 << 21
STYLE_SECOND_PASS = 1 << 22
STYLE_ITALICS = 1 << 19
STYLE_BOLD = 1 << 20
STYLE_PASS_AGAIN = 1 << 21
STYLE_SECOND_PASS = 1 << 22
STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS

DL_TERM = 1 << 23

HAS_TEXT = 1 << 24
FAIL_ON_TEXT = 1 << 25
FAIL_NEXT = 1 << 26
HAS_TEXT = 1 << 24
FAIL_ON_TEXT = 1 << 25
FAIL_NEXT = 1 << 26
FAIL_ON_LBRACE = 1 << 27
FAIL_ON_RBRACE = 1 << 28
FAIL_ON_EQUALS = 1 << 29
HAS_TEMPLATE = 1 << 30
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE)

TABLE_OPEN = 1 << 31
TABLE_CELL_OPEN = 1 << 32
HAS_TEMPLATE = 1 << 30
SAFETY_CHECK = (
HAS_TEXT
+ FAIL_ON_TEXT
+ FAIL_NEXT
+ FAIL_ON_LBRACE
+ FAIL_ON_RBRACE
+ FAIL_ON_EQUALS
+ HAS_TEMPLATE
)

TABLE_OPEN = 1 << 31
TABLE_CELL_OPEN = 1 << 32
TABLE_CELL_STYLE = 1 << 33
TABLE_ROW_OPEN = 1 << 34
TABLE_TD_LINE = 1 << 35
TABLE_TH_LINE = 1 << 36
TABLE_ROW_OPEN = 1 << 34
TABLE_TD_LINE = 1 << 35
TABLE_TH_LINE = 1 << 36
TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
TABLE_TD_LINE + TABLE_TH_LINE)
TABLE = (
TABLE_OPEN
+ TABLE_CELL_OPEN
+ TABLE_CELL_STYLE
+ TABLE_ROW_OPEN
+ TABLE_TD_LINE
+ TABLE_TH_LINE
)

HTML_ENTITY = 1 << 37

@@ -184,14 +203,20 @@ GL_HEADING = 1 << 0

# Aggregate contexts:

FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG +
STYLE + TABLE)
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE
UNSAFE = (
TEMPLATE_NAME
+ WIKILINK_TITLE
+ EXT_LINK_TITLE
+ TEMPLATE_PARAM_KEY
+ ARGUMENT_NAME
+ TAG_CLOSE
)
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN
NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI
NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK


def describe(context):
"""Return a string describing the given context value, for debugging."""
flags = []


+525 -510 src/mwparserfromhell/parser/ctokenizer/avl_tree.c
(File diff suppressed because it is too large)


+166 -177 src/mwparserfromhell/parser/ctokenizer/avl_tree.h

@@ -1,6 +1,6 @@
/*
* avl_tree.h - intrusive, nonrecursive AVL tree data structure (self-balancing
* binary search tree), header file
* binary search tree), header file
*
* Written in 2014-2016 by Eric Biggers <ebiggers3@gmail.com>
* Slight changes for compatibility by Ben Kurtovic <ben.kurtovic@gmail.com>
@@ -24,60 +24,60 @@
#include <stddef.h>

#if !defined(_MSC_VER) || (_MSC_VER >= 1600)
#include <stdint.h>
# include <stdint.h>
#endif

#ifdef __GNUC__
# define AVL_INLINE inline __attribute__((always_inline))
# define AVL_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER) && (_MSC_VER < 1900)
# define AVL_INLINE __inline
# define AVL_INLINE __inline
#else
# define AVL_INLINE inline
# define AVL_INLINE inline
#endif

/* Node in an AVL tree. Embed this in some other data structure. */
struct avl_tree_node {

/* Pointer to left child or NULL */
struct avl_tree_node *left;
/* Pointer to left child or NULL */
struct avl_tree_node *left;

/* Pointer to right child or NULL */
struct avl_tree_node *right;
/* Pointer to right child or NULL */
struct avl_tree_node *right;

/* Pointer to parent combined with the balance factor. This saves 4 or
* 8 bytes of memory depending on the CPU architecture.
*
* Low 2 bits: One greater than the balance factor of this subtree,
* which is equal to height(right) - height(left). The mapping is:
*
* 00 => -1
* 01 => 0
* 10 => +1
* 11 => undefined
*
* The rest of the bits are the pointer to the parent node. It must be
* 4-byte aligned, and it will be NULL if this is the root node and
* therefore has no parent. */
uintptr_t parent_balance;
/* Pointer to parent combined with the balance factor. This saves 4 or
* 8 bytes of memory depending on the CPU architecture.
*
* Low 2 bits: One greater than the balance factor of this subtree,
* which is equal to height(right) - height(left). The mapping is:
*
* 00 => -1
* 01 => 0
* 10 => +1
* 11 => undefined
*
* The rest of the bits are the pointer to the parent node. It must be
* 4-byte aligned, and it will be NULL if this is the root node and
* therefore has no parent. */
uintptr_t parent_balance;
};

/* Cast an AVL tree node to the containing data structure. */
#define avl_tree_entry(entry, type, member) \
((type*) ((char *)(entry) - offsetof(type, member)))
#define avl_tree_entry(entry, type, member) \
((type *) ((char *) (entry) -offsetof(type, member)))

/* Returns a pointer to the parent of the specified AVL tree node, or NULL if it
* is already the root of the tree. */
static AVL_INLINE struct avl_tree_node *
avl_get_parent(const struct avl_tree_node *node)
{
return (struct avl_tree_node *)(node->parent_balance & ~3);
return (struct avl_tree_node *) (node->parent_balance & ~3);
}

/* Marks the specified AVL tree node as unlinked from any tree. */
static AVL_INLINE void
avl_tree_node_set_unlinked(struct avl_tree_node *node)
{
node->parent_balance = (uintptr_t)node;
node->parent_balance = (uintptr_t) node;
}

/* Returns true iff the specified AVL tree node has been marked with
@@ -86,30 +86,29 @@ avl_tree_node_set_unlinked(struct avl_tree_node *node)
static AVL_INLINE int
avl_tree_node_is_unlinked(const struct avl_tree_node *node)
{
return node->parent_balance == (uintptr_t)node;
return node->parent_balance == (uintptr_t) node;
}

/* (Internal use only) */
extern void
avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr,
struct avl_tree_node *inserted);
extern void avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr,
struct avl_tree_node *inserted);

/*
* Looks up an item in the specified AVL tree.
*
* @root
* Pointer to the root of the AVL tree. (This can be NULL --- that just
* means the tree is empty.)
* Pointer to the root of the AVL tree. (This can be NULL --- that just
* means the tree is empty.)
*
* @cmp_ctx
* First argument to pass to the comparison callback. This generally
* should be a pointer to an object equal to the one being searched for.
* First argument to pass to the comparison callback. This generally
* should be a pointer to an object equal to the one being searched for.
*
* @cmp
* Comparison callback. Must return < 0, 0, or > 0 if the first argument
* is less than, equal to, or greater than the second argument,
* respectively. The first argument will be @cmp_ctx and the second
* argument will be a pointer to the AVL tree node of an item in the tree.
* Comparison callback. Must return < 0, 0, or > 0 if the first argument
* is less than, equal to, or greater than the second argument,
* respectively. The first argument will be @cmp_ctx and the second
* argument will be a pointer to the AVL tree node of an item in the tree.
*
* Returns a pointer to the AVL tree node of the resulting item, or NULL if the
* item was not found.
@@ -117,48 +116,49 @@ avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr,
* Example:
*
* struct int_wrapper {
* int data;
* struct avl_tree_node index_node;
* int data;
* struct avl_tree_node index_node;
* };
*
* static int _avl_cmp_int_to_node(const void *intptr,
* const struct avl_tree_node *nodeptr)
* const struct avl_tree_node *nodeptr)
* {
* int n1 = *(const int *)intptr;
* int n2 = avl_tree_entry(nodeptr, struct int_wrapper, index_node)->data;
* if (n1 < n2)
* return -1;
* else if (n1 > n2)
* return 1;
* else
* return 0;
* int n1 = *(const int *)intptr;
* int n2 = avl_tree_entry(nodeptr, struct int_wrapper, index_node)->data;
* if (n1 < n2)
* return -1;
* else if (n1 > n2)
* return 1;
* else
* return 0;
* }
*
* bool contains_int(struct avl_tree_node *root, int n)
* {
* struct avl_tree_node *result;
* struct avl_tree_node *result;
*
* result = avl_tree_lookup(root, &n, _avl_cmp_int_to_node);
* return result ? true : false;
* result = avl_tree_lookup(root, &n, _avl_cmp_int_to_node);
* return result ? true : false;
* }
*/
static AVL_INLINE struct avl_tree_node *
avl_tree_lookup(const struct avl_tree_node *root,
const void *cmp_ctx,
int (*cmp)(const void *, const struct avl_tree_node *))
const void *cmp_ctx,
int (*cmp)(const void *, const struct avl_tree_node *))
{
const struct avl_tree_node *cur = root;
const struct avl_tree_node *cur = root;

while (cur) {
int res = (*cmp)(cmp_ctx, cur);
if (res < 0)
cur = cur->left;
else if (res > 0)
cur = cur->right;
else
break;
}
return (struct avl_tree_node*)cur;
while (cur) {
int res = (*cmp)(cmp_ctx, cur);
if (res < 0) {
cur = cur->left;
} else if (res > 0) {
cur = cur->right;
} else {
break;
}
}
return (struct avl_tree_node *) cur;
}

/* Same as avl_tree_lookup(), but uses a more specific type for the comparison
@@ -167,44 +167,45 @@ avl_tree_lookup(const struct avl_tree_node *root,
* embedded 'struct avl_tree_node'. */
static AVL_INLINE struct avl_tree_node *
avl_tree_lookup_node(const struct avl_tree_node *root,
const struct avl_tree_node *node,
int (*cmp)(const struct avl_tree_node *,
const struct avl_tree_node *))
const struct avl_tree_node *node,
int (*cmp)(const struct avl_tree_node *,
const struct avl_tree_node *))
{
const struct avl_tree_node *cur = root;
const struct avl_tree_node *cur = root;

while (cur) {
int res = (*cmp)(node, cur);
if (res < 0)
cur = cur->left;
else if (res > 0)
cur = cur->right;
else
break;
}
return (struct avl_tree_node*)cur;
while (cur) {
int res = (*cmp)(node, cur);
if (res < 0) {
cur = cur->left;
} else if (res > 0) {
cur = cur->right;
} else {
break;
}
}
return (struct avl_tree_node *) cur;
}

/*
* Inserts an item into the specified AVL tree.
*
* @root_ptr
* Location of the AVL tree's root pointer. Indirection is needed because
* the root node may change as a result of rotations caused by the
* insertion. Initialize *root_ptr to NULL for an empty tree.
* Location of the AVL tree's root pointer. Indirection is needed because
* the root node may change as a result of rotations caused by the
* insertion. Initialize *root_ptr to NULL for an empty tree.
*
* @item
* Pointer to the `struct avl_tree_node' embedded in the item to insert.
* No members in it need be pre-initialized, although members in the
* containing structure should be pre-initialized so that @cmp can use them
* in comparisons.
* Pointer to the `struct avl_tree_node' embedded in the item to insert.
* No members in it need be pre-initialized, although members in the
* containing structure should be pre-initialized so that @cmp can use them
* in comparisons.
*
* @cmp
* Comparison callback. Must return < 0, 0, or > 0 if the first argument
* is less than, equal to, or greater than the second argument,
* respectively. The first argument will be @item and the second
* argument will be a pointer to an AVL tree node embedded in some
* previously-inserted item to which @item is being compared.
* Comparison callback. Must return < 0, 0, or > 0 if the first argument
* is less than, equal to, or greater than the second argument,
* respectively. The first argument will be @item and the second
* argument will be a pointer to an AVL tree node embedded in some
* previously-inserted item to which @item is being compared.
*
* If no item in the tree is comparatively equal (via @cmp) to @item, inserts
* @item and returns NULL. Otherwise does nothing and returns a pointer to the
@@ -214,150 +215,138 @@ avl_tree_lookup_node(const struct avl_tree_node *root,
* Example:
*
* struct int_wrapper {
* int data;
* struct avl_tree_node index_node;
* int data;
* struct avl_tree_node index_node;
* };
*
* #define GET_DATA(i) avl_tree_entry((i), struct int_wrapper, index_node)->data
*
* static int _avl_cmp_ints(const struct avl_tree_node *node1,
* const struct avl_tree_node *node2)
* const struct avl_tree_node *node2)
* {
* int n1 = GET_DATA(node1);
* int n2 = GET_DATA(node2);
* if (n1 < n2)
* return -1;
* else if (n1 > n2)
* return 1;
* else
* return 0;
* int n1 = GET_DATA(node1);
* int n2 = GET_DATA(node2);
* if (n1 < n2)
* return -1;
* else if (n1 > n2)
* return 1;
* else
* return 0;
* }
*
* bool insert_int(struct avl_tree_node **root_ptr, int data)
* {
* struct int_wrapper *i = malloc(sizeof(struct int_wrapper));
* i->data = data;
* if (avl_tree_insert(root_ptr, &i->index_node, _avl_cmp_ints)) {
* // Duplicate.
* free(i);
* return false;
* }
* return true;
* struct int_wrapper *i = malloc(sizeof(struct int_wrapper));
* i->data = data;
* if (avl_tree_insert(root_ptr, &i->index_node, _avl_cmp_ints)) {
* // Duplicate.
* free(i);
* return false;
* }
* return true;
* }
*/
static AVL_INLINE struct avl_tree_node *
avl_tree_insert(struct avl_tree_node **root_ptr,
struct avl_tree_node *item,
int (*cmp)(const struct avl_tree_node *,
const struct avl_tree_node *))
struct avl_tree_node *item,
int (*cmp)(const struct avl_tree_node *, const struct avl_tree_node *))
{
struct avl_tree_node **cur_ptr = root_ptr, *cur = NULL;
int res;
struct avl_tree_node **cur_ptr = root_ptr, *cur = NULL;
int res;

while (*cur_ptr) {
cur = *cur_ptr;
res = (*cmp)(item, cur);
if (res < 0)
cur_ptr = &cur->left;
else if (res > 0)
cur_ptr = &cur->right;
else
return cur;
}
*cur_ptr = item;
item->parent_balance = (uintptr_t)cur | 1;
avl_tree_rebalance_after_insert(root_ptr, item);
return NULL;
while (*cur_ptr) {
cur = *cur_ptr;
res = (*cmp)(item, cur);
if (res < 0) {
cur_ptr = &cur->left;
} else if (res > 0) {
cur_ptr = &cur->right;
} else {
return cur;
}
}
*cur_ptr = item;
item->parent_balance = (uintptr_t) cur | 1;
avl_tree_rebalance_after_insert(root_ptr, item);
return NULL;
}

/* Removes an item from the specified AVL tree.
* See implementation for details. */
extern void
avl_tree_remove(struct avl_tree_node **root_ptr, struct avl_tree_node *node);
extern void avl_tree_remove(struct avl_tree_node **root_ptr,
struct avl_tree_node *node);

/* Nonrecursive AVL tree traversal functions */

extern struct avl_tree_node *
avl_tree_first_in_order(const struct avl_tree_node *root);
extern struct avl_tree_node *avl_tree_first_in_order(const struct avl_tree_node *root);

extern struct avl_tree_node *
avl_tree_last_in_order(const struct avl_tree_node *root);
extern struct avl_tree_node *avl_tree_last_in_order(const struct avl_tree_node *root);

extern struct avl_tree_node *
avl_tree_next_in_order(const struct avl_tree_node *node);
extern struct avl_tree_node *avl_tree_next_in_order(const struct avl_tree_node *node);

extern struct avl_tree_node *
avl_tree_prev_in_order(const struct avl_tree_node *node);
extern struct avl_tree_node *avl_tree_prev_in_order(const struct avl_tree_node *node);

extern struct avl_tree_node *
avl_tree_first_in_postorder(const struct avl_tree_node *root);

extern struct avl_tree_node *
avl_tree_next_in_postorder(const struct avl_tree_node *prev,
const struct avl_tree_node *prev_parent);
const struct avl_tree_node *prev_parent);

/*
* Iterate through the nodes in an AVL tree in sorted order.
* You may not modify the tree during the iteration.
*
* @child_struct
* Variable that will receive a pointer to each struct inserted into the
* tree.
* Variable that will receive a pointer to each struct inserted into the
* tree.
* @root
* Root of the AVL tree.
* Root of the AVL tree.
* @struct_name
* Type of *child_struct.
* Type of *child_struct.
* @struct_member
* Member of @struct_name type that is the AVL tree node.
* Member of @struct_name type that is the AVL tree node.
*
* Example:
*
* struct int_wrapper {
* int data;
* struct avl_tree_node index_node;
* int data;
* struct avl_tree_node index_node;
* };
*
* void print_ints(struct avl_tree_node *root)
* {
* struct int_wrapper *i;
* struct int_wrapper *i;
*
* avl_tree_for_each_in_order(i, root, struct int_wrapper, index_node)
* printf("%d\n", i->data);
* avl_tree_for_each_in_order(i, root, struct int_wrapper, index_node)
* printf("%d\n", i->data);
* }
*/
#define avl_tree_for_each_in_order(child_struct, root, \
struct_name, struct_member) \
for (struct avl_tree_node *_cur = \
avl_tree_first_in_order(root); \
_cur && ((child_struct) = \
avl_tree_entry(_cur, struct_name, \
struct_member), 1); \
_cur = avl_tree_next_in_order(_cur))
#define avl_tree_for_each_in_order(child_struct, root, struct_name, struct_member) \
for (struct avl_tree_node *_cur = avl_tree_first_in_order(root); \
_cur && \
((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1); \
_cur = avl_tree_next_in_order(_cur))

/*
* Like avl_tree_for_each_in_order(), but uses the reverse order.
*/
#define avl_tree_for_each_in_reverse_order(child_struct, root, \
struct_name, struct_member) \
for (struct avl_tree_node *_cur = \
avl_tree_last_in_order(root); \
_cur && ((child_struct) = \
avl_tree_entry(_cur, struct_name, \
struct_member), 1); \
_cur = avl_tree_prev_in_order(_cur))
#define avl_tree_for_each_in_reverse_order( \
child_struct, root, struct_name, struct_member) \
for (struct avl_tree_node *_cur = avl_tree_last_in_order(root); \
_cur && \
((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1); \
_cur = avl_tree_prev_in_order(_cur))

/*
* Like avl_tree_for_each_in_order(), but iterates through the nodes in
* postorder, so the current node may be deleted or freed.
*/
#define avl_tree_for_each_in_postorder(child_struct, root, \
struct_name, struct_member) \
for (struct avl_tree_node *_cur = \
avl_tree_first_in_postorder(root), *_parent; \
_cur && ((child_struct) = \
avl_tree_entry(_cur, struct_name, \
struct_member), 1) \
&& (_parent = avl_get_parent(_cur), 1); \
_cur = avl_tree_next_in_postorder(_cur, _parent))
#define avl_tree_for_each_in_postorder(child_struct, root, struct_name, struct_member) \
for (struct avl_tree_node *_cur = avl_tree_first_in_postorder(root), *_parent; \
_cur && \
((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1) && \
(_parent = avl_get_parent(_cur), 1); \
_cur = avl_tree_next_in_postorder(_cur, _parent))

#endif /* _AVL_TREE_H_ */

+34 -33 src/mwparserfromhell/parser/ctokenizer/common.h

@@ -23,55 +23,56 @@ SOFTWARE.
#pragma once

#ifndef PY_SSIZE_T_CLEAN
#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html
# define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html
#endif

#include <Python.h>
#include <structmember.h>
#include <bytesobject.h>
#include <structmember.h>

#include "avl_tree.h"

/* Compatibility macros */

#ifndef uint64_t
#define uint64_t unsigned PY_LONG_LONG
# define uint64_t unsigned PY_LONG_LONG
#endif

#define malloc PyObject_Malloc // XXX: yuck
#define malloc PyObject_Malloc // XXX: yuck
#define realloc PyObject_Realloc
#define free PyObject_Free

/* Unicode support macros */

#define PyUnicode_FROM_SINGLE(chr) \
#define PyUnicode_FROM_SINGLE(chr) \
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)

/* Error handling macros */

#define BAD_ROUTE self->route_state
#define BAD_ROUTE_CONTEXT self->route_context
#define FAIL_ROUTE(context) { \
self->route_state = 1; \
self->route_context = context; \
}
#define RESET_ROUTE() self->route_state = 0
#define BAD_ROUTE self->route_state
#define BAD_ROUTE_CONTEXT self->route_context
#define FAIL_ROUTE(context) \
do { \
self->route_state = 1; \
self->route_context = context; \
} while (0)
#define RESET_ROUTE() self->route_state = 0

/* Shared globals */

extern char** entitydefs;
extern char **entitydefs;

extern PyObject* NOARGS;
extern PyObject* definitions;
extern PyObject *NOARGS;
extern PyObject *definitions;

/* Structs */

typedef struct {
Py_ssize_t capacity;
Py_ssize_t length;
PyObject* object;
PyObject *object;
int kind;
void* data;
void *data;
} Textbuffer;

typedef struct {
@@ -80,19 +81,19 @@ typedef struct {
} StackIdent;

struct Stack {
PyObject* stack;
PyObject *stack;
uint64_t context;
Textbuffer* textbuffer;
Textbuffer *textbuffer;
StackIdent ident;
struct Stack* next;
struct Stack *next;
};
typedef struct Stack Stack;

typedef struct {
PyObject* object; /* base PyUnicodeObject object */
Py_ssize_t length; /* length of object, in code points */
int kind; /* object's kind value */
void* data; /* object's raw unicode buffer */
PyObject *object; /* base PyUnicodeObject object */
Py_ssize_t length; /* length of object, in code points */
int kind; /* object's kind value */
void *data; /* object's raw unicode buffer */
} TokenizerInput;

typedef struct avl_tree_node avl_tree;
@@ -104,13 +105,13 @@ typedef struct {

typedef struct {
PyObject_HEAD
TokenizerInput text; /* text to tokenize */
Stack* topstack; /* topmost stack */
Py_ssize_t head; /* current position in text */
int global; /* global context */
int depth; /* stack recursion depth */
int route_state; /* whether a BadRoute has been triggered */
uint64_t route_context; /* context when the last BadRoute was triggered */
avl_tree* bad_routes; /* stack idents for routes known to fail */
int skip_style_tags; /* temp fix for the sometimes broken tag parser */
TokenizerInput text; /* text to tokenize */
Stack *topstack; /* topmost stack */
Py_ssize_t head; /* current position in text */
int global; /* global context */
int depth; /* stack recursion depth */
int route_state; /* whether a BadRoute has been triggered */
uint64_t route_context; /* context when the last BadRoute was triggered */
avl_tree *bad_routes; /* stack idents for routes known to fail */
int skip_style_tags; /* temp fix for the sometimes broken tag parser */
} Tokenizer;

+ 11
- 5
src/mwparserfromhell/parser/ctokenizer/contexts.h

@@ -89,11 +89,17 @@ SOFTWARE.

/* Aggregate contexts */

#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)
#define AGG_FAIL \
(LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | \
LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE \
(LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | \
LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS \
(LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS \
(LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)
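
A small sketch of how these aggregate masks are consumed (hypothetical helper; assumes contexts.h is included): a single bitwise test replaces a chain of per-context checks.

#include <stdint.h>

static int
can_parse_wikilink(uint64_t context)
{
    /* Wikilinks are disallowed when any AGG_NO_WIKILINKS bit is set. */
    return !(context & AGG_NO_WIKILINKS);
}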

/* Tag contexts */



+ 31
- 23
src/mwparserfromhell/parser/ctokenizer/definitions.c

@@ -27,7 +27,8 @@ SOFTWARE.
See the Python version for data sources.
*/

static const char* URI_SCHEMES[] = {
// clang-format off
static const char *URI_SCHEMES[] = {
"bitcoin",
"ftp",
"ftps",
@@ -55,10 +56,10 @@ static const char* URI_SCHEMES[] = {
"urn",
"worldwind",
"xmpp",
NULL,
NULL,
};

static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
static const char *URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
"bitcoin",
"geo",
"magnet",
@@ -73,7 +74,7 @@ static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
NULL,
};

static const char* PARSER_BLACKLIST[] = {
static const char *PARSER_BLACKLIST[] = {
"categorytree",
"ce",
"chem",
@@ -93,32 +94,32 @@ static const char* PARSER_BLACKLIST[] = {
"timeline",
NULL,
};
// clang-format on

static const char* SINGLE[] = {
"br", "wbr", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td",
"tr", NULL
};
static const char *SINGLE[] = {
"br", "wbr", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr", NULL};

static const char* SINGLE_ONLY[] = {
"br", "wbr", "hr", "meta", "link", "img", NULL
};
static const char *SINGLE_ONLY[] = {"br", "wbr", "hr", "meta", "link", "img", NULL};

/*
Convert a PyUnicodeObject to a lowercase ASCII char* array and store it in
the second argument. The caller must free the return value when finished.
If the return value is NULL, the conversion failed and *string is not set.
*/
static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string)
static PyObject *
unicode_to_lcase_ascii(PyObject *input, const char **string)
{
PyObject *lower = PyObject_CallMethod(input, "lower", NULL), *bytes;

if (!lower)
if (!lower) {
return NULL;
}
bytes = PyUnicode_AsASCIIString(lower);
Py_DECREF(lower);
if (!bytes) {
if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) {
PyErr_Clear();
}
return NULL;
}
*string = PyBytes_AS_STRING(bytes);
@@ -128,14 +129,16 @@ static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string)
/*
Return whether a PyUnicodeObject is in a list of lowercase ASCII strings.
*/
static int unicode_in_string_list(PyObject *input, const char **list)
static int
unicode_in_string_list(PyObject *input, const char **list)
{
const char *string;
PyObject *temp = unicode_to_lcase_ascii(input, &string);
int retval = 0;

if (!temp)
if (!temp) {
return 0;
}

while (*list) {
if (!strcmp(*(list++), string)) {
@@ -144,7 +147,7 @@ static int unicode_in_string_list(PyObject *input, const char **list)
}
}

end:
end:
Py_DECREF(temp);
return retval;
}
@@ -152,7 +155,8 @@ static int unicode_in_string_list(PyObject *input, const char **list)
/*
Return whether the given tag's contents should be passed to the parser.
*/
int is_parsable(PyObject *tag)
int
is_parsable(PyObject *tag)
{
return !unicode_in_string_list(tag, PARSER_BLACKLIST);
}
@@ -160,7 +164,8 @@ int is_parsable(PyObject *tag)
/*
Return whether or not the given tag can exist without a close tag.
*/
int is_single(PyObject *tag)
int
is_single(PyObject *tag)
{
return unicode_in_string_list(tag, SINGLE);
}
@@ -168,7 +173,8 @@ int is_single(PyObject *tag)
/*
Return whether or not the given tag must exist without a close tag.
*/
int is_single_only(PyObject *tag)
int
is_single_only(PyObject *tag)
{
return unicode_in_string_list(tag, SINGLE_ONLY);
}
@@ -176,10 +182,12 @@ int is_single_only(PyObject *tag)
/*
Return whether the given scheme is valid for external links.
*/
int is_scheme(PyObject *scheme, int slashes)
int
is_scheme(PyObject *scheme, int slashes)
{
if (slashes)
if (slashes) {
return unicode_in_string_list(scheme, URI_SCHEMES);
else
} else {
return unicode_in_string_list(scheme, URI_SCHEMES_AUTHORITY_OPTIONAL);
}
}
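
A hypothetical caller sketch for is_scheme() (assumes an initialized interpreter and that "mailto" remains in the authority-optional list): the slashes flag selects which scheme table is consulted.

static int
scheme_demo(void)
{
    PyObject *scheme = PyUnicode_FromString("mailto");
    int valid;

    if (!scheme)
        return -1;
    /* No "//" after the colon, so check URI_SCHEMES_AUTHORITY_OPTIONAL. */
    valid = is_scheme(scheme, 0);
    Py_DECREF(scheme);
    return valid;
}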

+ 5
- 6
src/mwparserfromhell/parser/ctokenizer/definitions.h

@@ -28,12 +28,11 @@ SOFTWARE.

/* Functions */

int is_parsable(PyObject*);
int is_single(PyObject*);
int is_single_only(PyObject*);
int is_scheme(PyObject*, int);
int is_parsable(PyObject *);
int is_single(PyObject *);
int is_single_only(PyObject *);
int is_scheme(PyObject *, int);

/* Macros */

#define GET_HTML_TAG(markup) \
(markup == ':' ? "dd" : markup == ';' ? "dt" : "li")
#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li")
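
Illustrative checks of the mapping GET_HTML_TAG encodes (hypothetical test helper): ';' and ':' come from definition-list markup; everything else becomes a list item.

#include <assert.h>
#include <string.h>

static void
html_tag_demo(void)
{
    assert(strcmp(GET_HTML_TAG(';'), "dt") == 0);  /* definition term */
    assert(strcmp(GET_HTML_TAG(':'), "dd") == 0);  /* definition description */
    assert(strcmp(GET_HTML_TAG('*'), "li") == 0);  /* ordinary list item */
}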

+ 20
- 14
src/mwparserfromhell/parser/ctokenizer/tag_data.c

@@ -26,13 +26,14 @@ SOFTWARE.
/*
Initialize a new TagData object.
*/
TagData* TagData_new(TokenizerInput* text)
TagData *
TagData_new(TokenizerInput *text)
{
#define ALLOC_BUFFER(name) \
name = Textbuffer_new(text); \
if (!name) { \
TagData_dealloc(self); \
return NULL; \
#define ALLOC_BUFFER(name) \
name = Textbuffer_new(text); \
if (!name) { \
TagData_dealloc(self); \
return NULL; \
}

TagData *self = malloc(sizeof(TagData));
@@ -54,25 +55,30 @@ TagData* TagData_new(TokenizerInput* text)
/*
Deallocate the given TagData object.
*/
void TagData_dealloc(TagData* self)
void
TagData_dealloc(TagData *self)
{
if (self->pad_first)
if (self->pad_first) {
Textbuffer_dealloc(self->pad_first);
if (self->pad_before_eq)
}
if (self->pad_before_eq) {
Textbuffer_dealloc(self->pad_before_eq);
if (self->pad_after_eq)
}
if (self->pad_after_eq) {
Textbuffer_dealloc(self->pad_after_eq);
}
free(self);
}

/*
Clear the internal buffers of the given TagData object.
*/
int TagData_reset_buffers(TagData* self)
int
TagData_reset_buffers(TagData *self)
{
if (Textbuffer_reset(self->pad_first) ||
Textbuffer_reset(self->pad_before_eq) ||
Textbuffer_reset(self->pad_after_eq))
if (Textbuffer_reset(self->pad_first) || Textbuffer_reset(self->pad_before_eq) ||
Textbuffer_reset(self->pad_after_eq)) {
return -1;
}
return 0;
}

+ 6
- 6
src/mwparserfromhell/parser/ctokenizer/tag_data.h

@@ -29,15 +29,15 @@ SOFTWARE.

typedef struct {
uint64_t context;
Textbuffer* pad_first;
Textbuffer* pad_before_eq;
Textbuffer* pad_after_eq;
Textbuffer *pad_first;
Textbuffer *pad_before_eq;
Textbuffer *pad_after_eq;
Py_UCS4 quoter;
Py_ssize_t reset;
} TagData;

/* Functions */

TagData* TagData_new(TokenizerInput*);
void TagData_dealloc(TagData*);
int TagData_reset_buffers(TagData*);
TagData *TagData_new(TokenizerInput *);
void TagData_dealloc(TagData *);
int TagData_reset_buffers(TagData *);

+ 45
- 26
src/mwparserfromhell/parser/ctokenizer/textbuffer.c

@@ -23,20 +23,22 @@ SOFTWARE.
#include "textbuffer.h"

#define INITIAL_CAPACITY 32
#define RESIZE_FACTOR 2
#define CONCAT_EXTRA 32
#define RESIZE_FACTOR 2
#define CONCAT_EXTRA 32

/*
Internal allocation function for textbuffers.
*/
static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar)
static int
internal_alloc(Textbuffer *self, Py_UCS4 maxchar)
{
self->capacity = INITIAL_CAPACITY;
self->length = 0;

self->object = PyUnicode_New(self->capacity, maxchar);
if (!self->object)
if (!self->object) {
return -1;
}
self->kind = PyUnicode_KIND(self->object);
self->data = PyUnicode_DATA(self->object);

@@ -46,7 +48,8 @@ static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar)
/*
Internal deallocation function for textbuffers.
*/
static void internal_dealloc(Textbuffer* self)
static void
internal_dealloc(Textbuffer *self)
{
Py_DECREF(self->object);
}
@@ -54,14 +57,16 @@ static void internal_dealloc(Textbuffer* self)
/*
Internal resize function.
*/
static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
static int
internal_resize(Textbuffer *self, Py_ssize_t new_cap)
{
PyObject *newobj;
void *newdata;

newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object));
if (!newobj)
if (!newobj) {
return -1;
}
newdata = PyUnicode_DATA(newobj);
memcpy(newdata, self->data, self->length * self->kind);
Py_DECREF(self->object);
@@ -75,22 +80,25 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
/*
Create a new textbuffer object.
*/
Textbuffer* Textbuffer_new(TokenizerInput* text)
Textbuffer *
Textbuffer_new(TokenizerInput *text)
{
Textbuffer* self = malloc(sizeof(Textbuffer));
Textbuffer *self = malloc(sizeof(Textbuffer));
Py_UCS4 maxchar = 0;

maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);

if (!self)
if (!self) {
goto fail_nomem;
if (internal_alloc(self, maxchar) < 0)
}
if (internal_alloc(self, maxchar) < 0) {
goto fail_dealloc;
}
return self;

fail_dealloc:
fail_dealloc:
free(self);
fail_nomem:
fail_nomem:
PyErr_NoMemory();
return NULL;
}
@@ -98,7 +106,8 @@ Textbuffer* Textbuffer_new(TokenizerInput* text)
/*
Deallocate the given textbuffer.
*/
void Textbuffer_dealloc(Textbuffer* self)
void
Textbuffer_dealloc(Textbuffer *self)
{
internal_dealloc(self);
free(self);
@@ -107,26 +116,30 @@ void Textbuffer_dealloc(Textbuffer* self)
/*
Reset a textbuffer to its initial, empty state.
*/
int Textbuffer_reset(Textbuffer* self)
int
Textbuffer_reset(Textbuffer *self)
{
Py_UCS4 maxchar = 0;

maxchar = PyUnicode_MAX_CHAR_VALUE(self->object);

internal_dealloc(self);
if (internal_alloc(self, maxchar))
if (internal_alloc(self, maxchar)) {
return -1;
}
return 0;
}

/*
Write a Unicode codepoint to the given textbuffer.
*/
int Textbuffer_write(Textbuffer* self, Py_UCS4 code)
int
Textbuffer_write(Textbuffer *self, Py_UCS4 code)
{
if (self->length >= self->capacity) {
if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0)
if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0) {
return -1;
}
}

PyUnicode_WRITE(self->kind, self->data, self->length++, code);
@@ -139,7 +152,8 @@ int Textbuffer_write(Textbuffer* self, Py_UCS4 code)

This function does not check for bounds.
*/
Py_UCS4 Textbuffer_read(Textbuffer* self, Py_ssize_t index)
Py_UCS4
Textbuffer_read(Textbuffer *self, Py_ssize_t index)
{
return PyUnicode_READ(self->kind, self->data, index);
}
@@ -147,7 +161,8 @@ Py_UCS4 Textbuffer_read(Textbuffer* self, Py_ssize_t index)
/*
Return the contents of the textbuffer as a Python Unicode object.
*/
PyObject* Textbuffer_render(Textbuffer* self)
PyObject *
Textbuffer_render(Textbuffer *self)
{
return PyUnicode_FromKindAndData(self->kind, self->data, self->length);
}
@@ -155,17 +170,20 @@ PyObject* Textbuffer_render(Textbuffer* self)
/*
Concatenate the 'other' textbuffer onto the end of the given textbuffer.
*/
int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
int
Textbuffer_concat(Textbuffer *self, Textbuffer *other)
{
Py_ssize_t newlen = self->length + other->length;

if (newlen > self->capacity) {
if (internal_resize(self, newlen + CONCAT_EXTRA) < 0)
if (internal_resize(self, newlen + CONCAT_EXTRA) < 0) {
return -1;
}
}

assert(self->kind == other->kind);
memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data,
memcpy(((Py_UCS1 *) self->data) + self->kind * self->length,
other->data,
other->length * other->kind);

self->length = newlen;
@@ -175,15 +193,16 @@ int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
/*
Reverse the contents of the given textbuffer.
*/
void Textbuffer_reverse(Textbuffer* self)
void
Textbuffer_reverse(Textbuffer *self)
{
Py_ssize_t i, end = self->length - 1;
Py_UCS4 tmp;

for (i = 0; i < self->length / 2; i++) {
tmp = PyUnicode_READ(self->kind, self->data, i);
PyUnicode_WRITE(self->kind, self->data, i,
PyUnicode_READ(self->kind, self->data, end - i));
PyUnicode_WRITE(
self->kind, self->data, i, PyUnicode_READ(self->kind, self->data, end - i));
PyUnicode_WRITE(self->kind, self->data, end - i, tmp);
}
}

+ 8
- 8
src/mwparserfromhell/parser/ctokenizer/textbuffer.h

@@ -26,11 +26,11 @@ SOFTWARE.

/* Functions */

Textbuffer* Textbuffer_new(TokenizerInput*);
void Textbuffer_dealloc(Textbuffer*);
int Textbuffer_reset(Textbuffer*);
int Textbuffer_write(Textbuffer*, Py_UCS4);
Py_UCS4 Textbuffer_read(Textbuffer*, Py_ssize_t);
PyObject* Textbuffer_render(Textbuffer*);
int Textbuffer_concat(Textbuffer*, Textbuffer*);
void Textbuffer_reverse(Textbuffer*);
Textbuffer *Textbuffer_new(TokenizerInput *);
void Textbuffer_dealloc(Textbuffer *);
int Textbuffer_reset(Textbuffer *);
int Textbuffer_write(Textbuffer *, Py_UCS4);
Py_UCS4 Textbuffer_read(Textbuffer *, Py_ssize_t);
PyObject *Textbuffer_render(Textbuffer *);
int Textbuffer_concat(Textbuffer *, Textbuffer *);
void Textbuffer_reverse(Textbuffer *);
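
A sketch of the lifecycle these declarations imply (hypothetical helper; assumes a loaded TokenizerInput): writes are amortized O(1) because internal_resize() grows capacity by RESIZE_FACTOR.

static PyObject *
slice_to_unicode(TokenizerInput *text, Py_ssize_t start, Py_ssize_t stop)
{
    Textbuffer *buf = Textbuffer_new(text);
    PyObject *result;
    Py_ssize_t i;

    if (!buf)
        return NULL;
    for (i = start; i < stop; i++) {
        if (Textbuffer_write(buf, PyUnicode_READ(text->kind, text->data, i)) < 0) {
            Textbuffer_dealloc(buf);
            return NULL;
        }
    }
    result = Textbuffer_render(buf);  /* new reference, or NULL on error */
    Textbuffer_dealloc(buf);
    return result;
}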

+ 984
- 771
src/mwparserfromhell/parser/ctokenizer/tok_parse.c
File diff suppressed because it is too large


+ 4
- 3
src/mwparserfromhell/parser/ctokenizer/tok_parse.h

@@ -25,11 +25,12 @@ SOFTWARE.
#include "common.h"

static const Py_UCS4 MARKERS[] = {
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
'-', '!', '\n', '\0'};
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
'#', '*', ';', ':', '/', '-', '!', '\n', '\0',
};

#define NUM_MARKERS 19

/* Functions */

PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);
PyObject *Tokenizer_parse(Tokenizer *, uint64_t, int);

+ 122
- 79
src/mwparserfromhell/parser/ctokenizer/tok_support.c

@@ -27,9 +27,10 @@ SOFTWARE.
/*
Add a new token stack, context, and textbuffer to the list.
*/
int Tokenizer_push(Tokenizer* self, uint64_t context)
int
Tokenizer_push(Tokenizer *self, uint64_t context)
{
Stack* top = malloc(sizeof(Stack));
Stack *top = malloc(sizeof(Stack));

if (!top) {
PyErr_NoMemory();
@@ -38,8 +39,9 @@ int Tokenizer_push(Tokenizer* self, uint64_t context)
top->stack = PyList_New(0);
top->context = context;
top->textbuffer = Textbuffer_new(&self->text);
if (!top->textbuffer)
if (!top->textbuffer) {
return -1;
}
top->ident.head = self->head;
top->ident.context = context;
top->next = self->topstack;
@@ -51,16 +53,19 @@ int Tokenizer_push(Tokenizer* self, uint64_t context)
/*
Push the textbuffer onto the stack as a Text node and clear it.
*/
int Tokenizer_push_textbuffer(Tokenizer* self)
int
Tokenizer_push_textbuffer(Tokenizer *self)
{
PyObject *text, *kwargs, *token;
Textbuffer* buffer = self->topstack->textbuffer;
Textbuffer *buffer = self->topstack->textbuffer;

if (buffer->length == 0)
if (buffer->length == 0) {
return 0;
}
text = Textbuffer_render(buffer);
if (!text)
if (!text) {
return -1;
}
kwargs = PyDict_New();
if (!kwargs) {
Py_DECREF(text);
@@ -70,24 +75,27 @@ int Tokenizer_push_textbuffer(Tokenizer* self)
Py_DECREF(text);
token = PyObject_Call(Text, NOARGS, kwargs);
Py_DECREF(kwargs);
if (!token)
if (!token) {
return -1;
}
if (PyList_Append(self->topstack->stack, token)) {
Py_DECREF(token);
return -1;
}
Py_DECREF(token);
if (Textbuffer_reset(buffer))
if (Textbuffer_reset(buffer)) {
return -1;
}
return 0;
}

/*
Pop and deallocate the top token stack/context/textbuffer.
*/
void Tokenizer_delete_top_of_stack(Tokenizer* self)
void
Tokenizer_delete_top_of_stack(Tokenizer *self)
{
Stack* top = self->topstack;
Stack *top = self->topstack;

Py_DECREF(top->stack);
Textbuffer_dealloc(top->textbuffer);
@@ -99,12 +107,14 @@ void Tokenizer_delete_top_of_stack(Tokenizer* self)
/*
Pop the current stack/context/textbuffer, returning the stack.
*/
PyObject* Tokenizer_pop(Tokenizer* self)
PyObject *
Tokenizer_pop(Tokenizer *self)
{
PyObject* stack;
PyObject *stack;

if (Tokenizer_push_textbuffer(self))
if (Tokenizer_push_textbuffer(self)) {
return NULL;
}
stack = self->topstack->stack;
Py_INCREF(stack);
Tokenizer_delete_top_of_stack(self);
@@ -115,13 +125,15 @@ PyObject* Tokenizer_pop(Tokenizer* self)
Pop the current stack/context/textbuffer, returning the stack. We will also
replace the underlying stack's context with the current stack's.
*/
PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
PyObject *
Tokenizer_pop_keeping_context(Tokenizer *self)
{
PyObject* stack;
PyObject *stack;
uint64_t context;

if (Tokenizer_push_textbuffer(self))
if (Tokenizer_push_textbuffer(self)) {
return NULL;
}
stack = self->topstack->stack;
Py_INCREF(stack);
context = self->topstack->context;
@@ -133,16 +145,18 @@ PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
/*
Compare two route_tree_nodes that are in their avl_tree_node forms.
*/
static int compare_nodes(
const struct avl_tree_node* na, const struct avl_tree_node* nb)
static int
compare_nodes(const struct avl_tree_node *na, const struct avl_tree_node *nb)
{
route_tree_node *a = avl_tree_entry(na, route_tree_node, node);
route_tree_node *b = avl_tree_entry(nb, route_tree_node, node);

if (a->id.head < b->id.head)
if (a->id.head < b->id.head) {
return -1;
if (a->id.head > b->id.head)
}
if (a->id.head > b->id.head) {
return 1;
}
return (a->id.context > b->id.context) - (a->id.context < b->id.context);
}

@@ -152,13 +166,15 @@ static int compare_nodes(
This will be noticed when calling Tokenizer_check_route with the same head
and context, and the route will be failed immediately.
*/
void Tokenizer_memoize_bad_route(Tokenizer *self)
void
Tokenizer_memoize_bad_route(Tokenizer *self)
{
route_tree_node *node = malloc(sizeof(route_tree_node));
if (node) {
node->id = self->topstack->ident;
if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) {
free(node);
}
}
}

@@ -168,10 +184,11 @@ void Tokenizer_memoize_bad_route(Tokenizer *self)
ident of the failed stack so future parsing attempts down this route can be
stopped early.
*/
void* Tokenizer_fail_route(Tokenizer* self)
void *
Tokenizer_fail_route(Tokenizer *self)
{
uint64_t context = self->topstack->context;
PyObject* stack;
PyObject *stack;

Tokenizer_memoize_bad_route(self);
stack = Tokenizer_pop(self);
@@ -193,10 +210,11 @@ void* Tokenizer_fail_route(Tokenizer* self)
but this would introduce too much overhead in the C tokenizer due to the need
to check for a bad route after every call to Tokenizer_push.)
*/
int Tokenizer_check_route(Tokenizer* self, uint64_t context)
int
Tokenizer_check_route(Tokenizer *self, uint64_t context)
{
StackIdent ident = {self->head, context};
struct avl_tree_node *node = (struct avl_tree_node*) (&ident + 1);
struct avl_tree_node *node = (struct avl_tree_node *) (&ident + 1);

if (avl_tree_lookup_node(self->bad_routes, node, compare_nodes)) {
FAIL_ROUTE(context);
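
The intended call pattern for the memoization pair, as a hedged sketch (hypothetical function; assumes Tokenizer_check_route() returns negative for a known-bad route, matching the FAIL_ROUTE branch above):

static PyObject *
try_parse_route(Tokenizer *self, uint64_t context)
{
    if (Tokenizer_check_route(self, context) < 0)
        return NULL;  /* this (head, context) pair already failed once */
    if (Tokenizer_push(self, context))
        return NULL;
    /* ... attempt to parse; on hitting a dead end: ... */
    return Tokenizer_fail_route(self);  /* memoizes the ident, returns NULL */
}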
@@ -209,7 +227,8 @@ int Tokenizer_check_route(Tokenizer* self, uint64_t context)
Free the tokenizer's bad route cache tree. Intended to be called by the
main tokenizer function after parsing is finished.
*/
void Tokenizer_free_bad_route_tree(Tokenizer *self)
void
Tokenizer_free_bad_route_tree(Tokenizer *self)
{
struct avl_tree_node *cur = avl_tree_first_in_postorder(self->bad_routes);
struct avl_tree_node *parent;
@@ -225,17 +244,20 @@ void Tokenizer_free_bad_route_tree(Tokenizer *self)
/*
Write a token to the current token stack.
*/
int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first)
int
Tokenizer_emit_token(Tokenizer *self, PyObject *token, int first)
{
PyObject* instance;
PyObject *instance;

if (Tokenizer_push_textbuffer(self))
if (Tokenizer_push_textbuffer(self)) {
return -1;
}
instance = PyObject_CallObject(token, NULL);
if (!instance)
if (!instance) {
return -1;
if (first ? PyList_Insert(self->topstack->stack, 0, instance) :
PyList_Append(self->topstack->stack, instance)) {
}
if (first ? PyList_Insert(self->topstack->stack, 0, instance)
: PyList_Append(self->topstack->stack, instance)) {
Py_DECREF(instance);
return -1;
}
@@ -247,10 +269,13 @@ int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first)
Write a token to the current token stack, with kwargs. Steals a reference
to kwargs.
*/
int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
PyObject* kwargs, int first)
int
Tokenizer_emit_token_kwargs(Tokenizer *self,
PyObject *token,
PyObject *kwargs,
int first)
{
PyObject* instance;
PyObject *instance;

if (Tokenizer_push_textbuffer(self)) {
Py_DECREF(kwargs);
@@ -261,8 +286,8 @@ int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
Py_DECREF(kwargs);
return -1;
}
if (first ? PyList_Insert(self->topstack->stack, 0, instance):
PyList_Append(self->topstack->stack, instance)) {
if (first ? PyList_Insert(self->topstack->stack, 0, instance)
: PyList_Append(self->topstack->stack, instance)) {
Py_DECREF(instance);
Py_DECREF(kwargs);
return -1;
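
Because Tokenizer_emit_token_kwargs() steals the kwargs reference, a correct caller looks like this hypothetical sketch; note there is no Py_DECREF(kwargs) after the hand-off, even on failure:

static int
emit_padding_token(Tokenizer *self, PyObject *token, const char *padding)
{
    PyObject *kwargs, *value;

    kwargs = PyDict_New();
    if (!kwargs)
        return -1;
    value = PyUnicode_FromString(padding);
    if (!value || PyDict_SetItemString(kwargs, "padding", value) < 0) {
        Py_XDECREF(value);
        Py_DECREF(kwargs);
        return -1;
    }
    Py_DECREF(value);  /* the dict now holds its own reference */
    return Tokenizer_emit_token_kwargs(self, token, kwargs, 0);
}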
@@ -275,7 +300,8 @@ int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
/*
Write a Unicode codepoint to the current textbuffer.
*/
int Tokenizer_emit_char(Tokenizer* self, Py_UCS4 code)
int
Tokenizer_emit_char(Tokenizer *self, Py_UCS4 code)
{
return Textbuffer_write(self->topstack->textbuffer, code);
}
@@ -283,13 +309,15 @@ int Tokenizer_emit_char(Tokenizer* self, Py_UCS4 code)
/*
Write a string of text to the current textbuffer.
*/
int Tokenizer_emit_text(Tokenizer* self, const char* text)
int
Tokenizer_emit_text(Tokenizer *self, const char *text)
{
int i = 0;

while (text[i]) {
if (Tokenizer_emit_char(self, text[i]))
if (Tokenizer_emit_char(self, text[i])) {
return -1;
}
i++;
}
return 0;
@@ -299,7 +327,8 @@ int Tokenizer_emit_text(Tokenizer* self, const char* text)
Write the contents of another textbuffer to the current textbuffer,
deallocating it in the process.
*/
int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer)
int
Tokenizer_emit_textbuffer(Tokenizer *self, Textbuffer *buffer)
{
int retval = Textbuffer_concat(self->topstack->textbuffer, buffer);
Textbuffer_dealloc(buffer);
@@ -309,55 +338,63 @@ int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer)
/*
Write a series of tokens to the current stack at once.
*/
int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
int
Tokenizer_emit_all(Tokenizer *self, PyObject *tokenlist)
{
int pushed = 0;
PyObject *stack, *token, *left, *right, *text;
Textbuffer* buffer;
Textbuffer *buffer;
Py_ssize_t size;

if (PyList_GET_SIZE(tokenlist) > 0) {
token = PyList_GET_ITEM(tokenlist, 0);
switch (PyObject_IsInstance(token, Text)) {
case 0:
case 0:
break;
case 1: {
pushed = 1;
buffer = self->topstack->textbuffer;
if (buffer->length == 0) {
break;
case 1: {
pushed = 1;
buffer = self->topstack->textbuffer;
if (buffer->length == 0)
break;
left = Textbuffer_render(buffer);
if (!left)
return -1;
right = PyObject_GetAttrString(token, "text");
if (!right)
return -1;
text = PyUnicode_Concat(left, right);
Py_DECREF(left);
Py_DECREF(right);
if (!text)
return -1;
if (PyObject_SetAttrString(token, "text", text)) {
Py_DECREF(text);
return -1;
}
}
left = Textbuffer_render(buffer);
if (!left) {
return -1;
}
right = PyObject_GetAttrString(token, "text");
if (!right) {
return -1;
}
text = PyUnicode_Concat(left, right);
Py_DECREF(left);
Py_DECREF(right);
if (!text) {
return -1;
}
if (PyObject_SetAttrString(token, "text", text)) {
Py_DECREF(text);
if (Textbuffer_reset(buffer))
return -1;
break;
return -1;
}
case -1:
Py_DECREF(text);
if (Textbuffer_reset(buffer)) {
return -1;
}
break;
}
case -1:
return -1;
}
}
if (!pushed) {
if (Tokenizer_push_textbuffer(self))
if (Tokenizer_push_textbuffer(self)) {
return -1;
}
}
stack = self->topstack->stack;
size = PyList_GET_SIZE(stack);
if (PyList_SetSlice(stack, size, size, tokenlist))
if (PyList_SetSlice(stack, size, size, tokenlist)) {
return -1;
}
return 0;
}

@@ -365,9 +402,10 @@ int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
Pop the current stack, write text, and then write the stack. 'text' is a
NULL-terminated array of chars.
*/
int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
int
Tokenizer_emit_text_then_stack(Tokenizer *self, const char *text)
{
PyObject* stack = Tokenizer_pop(self);
PyObject *stack = Tokenizer_pop(self);

if (Tokenizer_emit_text(self, text)) {
Py_DECREF(stack);
@@ -389,7 +427,8 @@ int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
/*
Internal function to read the codepoint at the given index from the input.
*/
static Py_UCS4 read_codepoint(TokenizerInput* text, Py_ssize_t index)
static Py_UCS4
read_codepoint(TokenizerInput *text, Py_ssize_t index)
{
return PyUnicode_READ(text->kind, text->data, index);
}
@@ -397,24 +436,28 @@ static Py_UCS4 read_codepoint(TokenizerInput* text, Py_ssize_t index)
/*
Read the value at a relative point in the wikicode, forwards.
*/
Py_UCS4 Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
Py_UCS4
Tokenizer_read(Tokenizer *self, Py_ssize_t delta)
{
Py_ssize_t index = self->head + delta;

if (index >= self->text.length)
if (index >= self->text.length) {
return '\0';
}
return read_codepoint(&self->text, index);
}

/*
Read the value at a relative point in the wikicode, backwards.
*/
Py_UCS4 Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
Py_UCS4
Tokenizer_read_backwards(Tokenizer *self, Py_ssize_t delta)
{
Py_ssize_t index;

if (delta > self->head)
if (delta > self->head) {
return '\0';
}
index = self->head - delta;
return read_codepoint(&self->text, index);
}
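
The '\0' sentinel returned by both readers permits lookahead without explicit bounds checks; a hypothetical helper built on Tokenizer_read():

static int
peek_matches(Tokenizer *self, const char *expected)
{
    Py_ssize_t i;

    for (i = 0; expected[i]; i++) {
        /* Out-of-bounds reads yield '\0', which never matches here. */
        if (Tokenizer_read(self, i) != (Py_UCS4) expected[i])
            return 0;
    }
    return 1;
}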

+ 28
- 31
src/mwparserfromhell/parser/ctokenizer/tok_support.h

@@ -26,41 +26,38 @@ SOFTWARE.

/* Functions */

int Tokenizer_push(Tokenizer*, uint64_t);
int Tokenizer_push_textbuffer(Tokenizer*);
void Tokenizer_delete_top_of_stack(Tokenizer*);
PyObject* Tokenizer_pop(Tokenizer*);
PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
void Tokenizer_memoize_bad_route(Tokenizer*);
void* Tokenizer_fail_route(Tokenizer*);
int Tokenizer_check_route(Tokenizer*, uint64_t);
void Tokenizer_free_bad_route_tree(Tokenizer*);
int Tokenizer_emit_token(Tokenizer*, PyObject*, int);
int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int);
int Tokenizer_emit_char(Tokenizer*, Py_UCS4);
int Tokenizer_emit_text(Tokenizer*, const char*);
int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*);
int Tokenizer_emit_all(Tokenizer*, PyObject*);
int Tokenizer_emit_text_then_stack(Tokenizer*, const char*);
Py_UCS4 Tokenizer_read(Tokenizer*, Py_ssize_t);
Py_UCS4 Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);
int Tokenizer_push(Tokenizer *, uint64_t);
int Tokenizer_push_textbuffer(Tokenizer *);
void Tokenizer_delete_top_of_stack(Tokenizer *);
PyObject *Tokenizer_pop(Tokenizer *);
PyObject *Tokenizer_pop_keeping_context(Tokenizer *);
void Tokenizer_memoize_bad_route(Tokenizer *);
void *Tokenizer_fail_route(Tokenizer *);
int Tokenizer_check_route(Tokenizer *, uint64_t);
void Tokenizer_free_bad_route_tree(Tokenizer *);
int Tokenizer_emit_token(Tokenizer *, PyObject *, int);
int Tokenizer_emit_token_kwargs(Tokenizer *, PyObject *, PyObject *, int);
int Tokenizer_emit_char(Tokenizer *, Py_UCS4);
int Tokenizer_emit_text(Tokenizer *, const char *);
int Tokenizer_emit_textbuffer(Tokenizer *, Textbuffer *);
int Tokenizer_emit_all(Tokenizer *, PyObject *);
int Tokenizer_emit_text_then_stack(Tokenizer *, const char *);
Py_UCS4 Tokenizer_read(Tokenizer *, Py_ssize_t);
Py_UCS4 Tokenizer_read_backwards(Tokenizer *, Py_ssize_t);

/* Macros */

#define MAX_DEPTH 40
#define Tokenizer_CAN_RECURSE(self) \
(self->depth < MAX_DEPTH)
#define Tokenizer_IS_CURRENT_STACK(self, id) \
(self->topstack->ident.head == (id).head && \
#define MAX_DEPTH 40
#define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH)
#define Tokenizer_IS_CURRENT_STACK(self, id) \
(self->topstack->ident.head == (id).head && \
self->topstack->ident.context == (id).context)

#define Tokenizer_emit(self, token) \
Tokenizer_emit_token(self, token, 0)
#define Tokenizer_emit_first(self, token) \
Tokenizer_emit_token(self, token, 1)
#define Tokenizer_emit_kwargs(self, token, kwargs) \
#define Tokenizer_emit(self, token) Tokenizer_emit_token(self, token, 0)
#define Tokenizer_emit_first(self, token) Tokenizer_emit_token(self, token, 1)
#define Tokenizer_emit_kwargs(self, token, kwargs) \
Tokenizer_emit_token_kwargs(self, token, kwargs, 0)
#define Tokenizer_emit_first_kwargs(self, token, kwargs) \
#define Tokenizer_emit_first_kwargs(self, token, kwargs) \
Tokenizer_emit_token_kwargs(self, token, kwargs, 1)

+ 96
- 69
src/mwparserfromhell/parser/ctokenizer/tokenizer.c

@@ -30,12 +30,12 @@ SOFTWARE.
int route_state;
uint64_t route_context;

char** entitydefs;
char **entitydefs;

PyObject* NOARGS;
PyObject* definitions;
PyObject *NOARGS;
PyObject *definitions;

static PyObject* ParserError;
static PyObject *ParserError;

/* Forward declarations */

@@ -44,17 +44,18 @@ static int load_exceptions(void);
/*
Create a new tokenizer object.
*/
static PyObject*
Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
static PyObject *
Tokenizer_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
Tokenizer* self = (Tokenizer*) type->tp_alloc(type, 0);
return (PyObject*) self;
Tokenizer *self = (Tokenizer *) type->tp_alloc(type, 0);
return (PyObject *) self;
}

/*
Deallocate the given tokenizer's text field.
*/
static void dealloc_tokenizer_text(TokenizerInput* text)
static void
dealloc_tokenizer_text(TokenizerInput *text)
{
Py_XDECREF(text->object);
}
@@ -62,7 +63,8 @@ static void dealloc_tokenizer_text(TokenizerInput* text)
/*
Deallocate the given tokenizer object.
*/
static void Tokenizer_dealloc(Tokenizer* self)
static void
Tokenizer_dealloc(Tokenizer *self)
{
Stack *this = self->topstack, *next;
dealloc_tokenizer_text(&self->text);
@@ -74,13 +76,14 @@ static void Tokenizer_dealloc(Tokenizer* self)
free(this);
this = next;
}
Py_TYPE(self)->tp_free((PyObject*) self);
Py_TYPE(self)->tp_free((PyObject *) self);
}

/*
Initialize a new tokenizer instance's text field.
*/
static void init_tokenizer_text(TokenizerInput* text)
static void
init_tokenizer_text(TokenizerInput *text)
{
text->object = Py_None;
Py_INCREF(Py_None);
@@ -92,12 +95,14 @@ static void init_tokenizer_text(TokenizerInput* text)
/*
Initialize a new tokenizer instance by setting instance attributes.
*/
static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
static int
Tokenizer_init(Tokenizer *self, PyObject *args, PyObject *kwds)
{
static char* kwlist[] = {NULL};
static char *kwlist[] = {NULL};

if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist)) {
return -1;
}
init_tokenizer_text(&self->text);
self->topstack = NULL;
self->head = self->global = self->depth = 0;
@@ -110,13 +115,15 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
/*
Load input text into the tokenizer.
*/
static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
static int
load_tokenizer_text(TokenizerInput *text, PyObject *input)
{
dealloc_tokenizer_text(text);
text->object = input;

if (PyUnicode_READY(input) < 0)
if (PyUnicode_READY(input) < 0) {
return -1;
}
text->kind = PyUnicode_KIND(input);
text->data = PyUnicode_DATA(input);
text->length = PyUnicode_GET_LENGTH(input);
@@ -126,30 +133,34 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
/*
Build a list of tokens from a string of wikicode and return it.
*/
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
static PyObject *
Tokenizer_tokenize(Tokenizer *self, PyObject *args)
{
PyObject *input, *tokens;
uint64_t context = 0;
unsigned long long context = 0;
int skip_style_tags = 0;

if (PyArg_ParseTuple(args, "U|ii", &input, &context, &skip_style_tags)) {
if (PyArg_ParseTuple(args, "U|Kp", &input, &context, &skip_style_tags)) {
Py_INCREF(input);
if (load_tokenizer_text(&self->text, input))
if (load_tokenizer_text(&self->text, input)) {
return NULL;
}
else {
}
} else {
const char *encoded;
Py_ssize_t size;

/* Failed to parse a Unicode object; try a string instead. */
PyErr_Clear();
if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &context,
&skip_style_tags))
if (!PyArg_ParseTuple(
args, "s#|Kp", &encoded, &size, &context, &skip_style_tags)) {
return NULL;
if (!(input = PyUnicode_FromStringAndSize(encoded, size)))
}
if (!(input = PyUnicode_FromStringAndSize(encoded, size))) {
return NULL;
if (load_tokenizer_text(&self->text, input))
}
if (load_tokenizer_text(&self->text, input)) {
return NULL;
}
}

self->head = self->global = self->depth = 0;
@@ -162,73 +173,83 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)

if (!tokens || self->topstack) {
Py_XDECREF(tokens);
if (PyErr_Occurred())
if (PyErr_Occurred()) {
return NULL;
if (!ParserError && load_exceptions() < 0)
}
if (!ParserError && load_exceptions() < 0) {
return NULL;
}
if (BAD_ROUTE) {
RESET_ROUTE();
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
}
else if (self->topstack)
} else if (self->topstack) {
PyErr_SetString(ParserError,
"C tokenizer exited with non-empty token stack");
else
} else {
PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
}
return NULL;
}
return tokens;
}
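
For reference, a self-contained sketch of the "U|Kp" format adopted above (hypothetical function): U parses a str object, K an unsigned long long without overflow checking, and p a boolean predicate into an int.

static PyObject *
parse_args_demo(PyObject *self, PyObject *args)
{
    PyObject *input;
    unsigned long long context = 0;
    int skip = 0;

    if (!PyArg_ParseTuple(args, "U|Kp", &input, &context, &skip))
        return NULL;
    /* 'input' is a borrowed reference; Py_INCREF before storing it. */
    return PyLong_FromUnsignedLongLong(context + (unsigned long long) skip);
}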

static int load_entities(void)
static int
load_entities(void)
{
PyObject *tempmod, *defmap, *deflist;
unsigned numdefs, i;
PyObject *string;

tempmod = PyImport_ImportModule("html.entities");
if (!tempmod)
if (!tempmod) {
return -1;
}
defmap = PyObject_GetAttrString(tempmod, "entitydefs");
if (!defmap)
if (!defmap) {
return -1;
}
Py_DECREF(tempmod);
deflist = PyDict_Keys(defmap);
if (!deflist)
if (!deflist) {
return -1;
}
Py_DECREF(defmap);
numdefs = (unsigned) PyList_GET_SIZE(deflist);
entitydefs = calloc(numdefs + 1, sizeof(char*));
if (!entitydefs)
entitydefs = calloc(numdefs + 1, sizeof(char *));
if (!entitydefs) {
return -1;
}
for (i = 0; i < numdefs; i++) {
string = PyUnicode_AsASCIIString(PyList_GET_ITEM(deflist, i));
if (!string)
if (!string) {
return -1;
}
entitydefs[i] = PyBytes_AsString(string);
if (!entitydefs[i])
if (!entitydefs[i]) {
return -1;
}
}
Py_DECREF(deflist);
return 0;
}

static int load_tokens(void)
static int
load_tokens(void)
{
PyObject *tempmod, *tokens,
*globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(),
*fromlist = PyList_New(1),
*modname = PyUnicode_FromString("tokens");
PyObject *tempmod, *tokens;
PyObject *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(),
*fromlist = PyList_New(1), *modname = PyUnicode_FromString("tokens");
char *name = "mwparserfromhell.parser";

if (!fromlist || !modname)
if (!fromlist || !modname) {
return -1;
}
PyList_SET_ITEM(fromlist, 0, modname);
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
Py_DECREF(fromlist);
if (!tempmod)
if (!tempmod) {
return -1;
}
tokens = PyObject_GetAttrString(tempmod, "tokens");
Py_DECREF(tempmod);
load_tokens_from_module(tokens);
@@ -236,43 +257,45 @@ static int load_tokens(void)
return 0;
}

static int load_defs(void)
static int
load_defs(void)
{
PyObject *tempmod,
*globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(),
*fromlist = PyList_New(1),
*modname = PyUnicode_FromString("definitions");
PyObject *tempmod;
PyObject *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(),
*fromlist = PyList_New(1), *modname = PyUnicode_FromString("definitions");
char *name = "mwparserfromhell";

if (!fromlist || !modname)
if (!fromlist || !modname) {
return -1;
}
PyList_SET_ITEM(fromlist, 0, modname);
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
Py_DECREF(fromlist);
if (!tempmod)
if (!tempmod) {
return -1;
}
definitions = PyObject_GetAttrString(tempmod, "definitions");
Py_DECREF(tempmod);
return 0;
}

static int load_exceptions(void)
static int
load_exceptions(void)
{
PyObject *tempmod, *parsermod,
*globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(),
*fromlist = PyList_New(1),
*modname = PyUnicode_FromString("parser");
PyObject *tempmod, *parsermod;
PyObject *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(),
*fromlist = PyList_New(1), *modname = PyUnicode_FromString("parser");
char *name = "mwparserfromhell";

if (!fromlist || !modname)
if (!fromlist || !modname) {
return -1;
}
PyList_SET_ITEM(fromlist, 0, modname);
tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
Py_DECREF(fromlist);
if (!tempmod)
if (!tempmod) {
return -1;
}
parsermod = PyObject_GetAttrString(tempmod, "parser");
Py_DECREF(tempmod);
ParserError = PyObject_GetAttrString(parsermod, "ParserError");
@@ -280,22 +303,26 @@ static int load_exceptions(void)
return 0;
}

PyMODINIT_FUNC PyInit__tokenizer(void)
PyMODINIT_FUNC
PyInit__tokenizer(void)
{
PyObject *module;

TokenizerType.tp_new = PyType_GenericNew;
if (PyType_Ready(&TokenizerType) < 0)
if (PyType_Ready(&TokenizerType) < 0) {
return NULL;
}
module = PyModule_Create(&module_def);
if (!module)
if (!module) {
return NULL;
}
Py_INCREF(&TokenizerType);
PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
PyModule_AddObject(module, "CTokenizer", (PyObject *) &TokenizerType);
Py_INCREF(Py_True);
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
NOARGS = PyTuple_New(0);
if (!NOARGS || load_entities() || load_tokens() || load_defs())
if (!NOARGS || load_entities() || load_tokens() || load_defs()) {
return NULL;
}
return module;
}

+ 56
- 47
src/mwparserfromhell/parser/ctokenizer/tokenizer.h

@@ -27,67 +27,76 @@ SOFTWARE.

/* Functions */

static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
static void Tokenizer_dealloc(Tokenizer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
static PyObject *Tokenizer_new(PyTypeObject *, PyObject *, PyObject *);
static void Tokenizer_dealloc(Tokenizer *);
static int Tokenizer_init(Tokenizer *, PyObject *, PyObject *);
static PyObject *Tokenizer_tokenize(Tokenizer *, PyObject *);

/* Structs */

static PyMethodDef Tokenizer_methods[] = {
{"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
"Build a list of tokens from a string of wikicode and return it."},
{NULL}
{
"tokenize",
(PyCFunction) Tokenizer_tokenize,
METH_VARARGS,
"Build a list of tokens from a string of wikicode and return it.",
},
{NULL},
};

static PyMemberDef Tokenizer_members[] = {
{NULL}
{NULL},
};

static PyTypeObject TokenizerType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_tokenizer.CTokenizer", /* tp_name */
sizeof(Tokenizer), /* tp_basicsize */
0, /* tp_itemsize */
(destructor) Tokenizer_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
"Creates a list of tokens from a string of wikicode.", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
Tokenizer_methods, /* tp_methods */
Tokenizer_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc) Tokenizer_init, /* tp_init */
0, /* tp_alloc */
Tokenizer_new, /* tp_new */
PyVarObject_HEAD_INIT(NULL, 0) /* header */
"_tokenizer.CTokenizer", /* tp_name */
sizeof(Tokenizer), /* tp_basicsize */
0, /* tp_itemsize */
(destructor) Tokenizer_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
"Creates a list of tokens from a string of wikicode.", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
Tokenizer_methods, /* tp_methods */
Tokenizer_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc) Tokenizer_init, /* tp_init */
0, /* tp_alloc */
Tokenizer_new, /* tp_new */
};

static PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"_tokenizer",
"Creates a list of tokens from a string of wikicode.",
-1, NULL, NULL, NULL, NULL, NULL
-1,
NULL,
NULL,
NULL,
NULL,
NULL,
};

+ 42
- 44
src/mwparserfromhell/parser/ctokenizer/tokens.c

@@ -24,56 +24,55 @@ SOFTWARE.

/* Globals */

PyObject* Text;
PyObject* TemplateOpen;
PyObject* TemplateParamSeparator;
PyObject* TemplateParamEquals;
PyObject* TemplateClose;
PyObject* ArgumentOpen;
PyObject* ArgumentSeparator;
PyObject* ArgumentClose;
PyObject* WikilinkOpen;
PyObject* WikilinkSeparator;
PyObject* WikilinkClose;
PyObject* ExternalLinkOpen;
PyObject* ExternalLinkSeparator;
PyObject* ExternalLinkClose;
PyObject* HTMLEntityStart;
PyObject* HTMLEntityNumeric;
PyObject* HTMLEntityHex;
PyObject* HTMLEntityEnd;
PyObject* HeadingStart;
PyObject* HeadingEnd;
PyObject* CommentStart;
PyObject* CommentEnd;
PyObject* TagOpenOpen;
PyObject* TagAttrStart;
PyObject* TagAttrEquals;
PyObject* TagAttrQuote;
PyObject* TagCloseOpen;
PyObject* TagCloseSelfclose;
PyObject* TagOpenClose;
PyObject* TagCloseClose;
PyObject *Text;
PyObject *TemplateOpen;
PyObject *TemplateParamSeparator;
PyObject *TemplateParamEquals;
PyObject *TemplateClose;
PyObject *ArgumentOpen;
PyObject *ArgumentSeparator;
PyObject *ArgumentClose;
PyObject *WikilinkOpen;
PyObject *WikilinkSeparator;
PyObject *WikilinkClose;
PyObject *ExternalLinkOpen;
PyObject *ExternalLinkSeparator;
PyObject *ExternalLinkClose;
PyObject *HTMLEntityStart;
PyObject *HTMLEntityNumeric;
PyObject *HTMLEntityHex;
PyObject *HTMLEntityEnd;
PyObject *HeadingStart;
PyObject *HeadingEnd;
PyObject *CommentStart;
PyObject *CommentEnd;
PyObject *TagOpenOpen;
PyObject *TagAttrStart;
PyObject *TagAttrEquals;
PyObject *TagAttrQuote;
PyObject *TagCloseOpen;
PyObject *TagCloseSelfclose;
PyObject *TagOpenClose;
PyObject *TagCloseClose;

/*
Load individual tokens into globals from the given Python module object.
*/
void load_tokens_from_module(PyObject* module)
void
load_tokens_from_module(PyObject *module)
{
Text = PyObject_GetAttrString(module, "Text");

TemplateOpen = PyObject_GetAttrString(module, "TemplateOpen");
TemplateParamSeparator = PyObject_GetAttrString(module,
"TemplateParamSeparator");
TemplateParamEquals = PyObject_GetAttrString(module,
"TemplateParamEquals");
TemplateParamSeparator = PyObject_GetAttrString(module, "TemplateParamSeparator");
TemplateParamEquals = PyObject_GetAttrString(module, "TemplateParamEquals");
TemplateClose = PyObject_GetAttrString(module, "TemplateClose");

ArgumentOpen = PyObject_GetAttrString(module, "ArgumentOpen");
@@ -85,8 +84,7 @@ void load_tokens_from_module(PyObject* module)
WikilinkClose = PyObject_GetAttrString(module, "WikilinkClose");

ExternalLinkOpen = PyObject_GetAttrString(module, "ExternalLinkOpen");
ExternalLinkSeparator = PyObject_GetAttrString(module,
"ExternalLinkSeparator");
ExternalLinkSeparator = PyObject_GetAttrString(module, "ExternalLinkSeparator");
ExternalLinkClose = PyObject_GetAttrString(module, "ExternalLinkClose");

HTMLEntityStart = PyObject_GetAttrString(module, "HTMLEntityStart");


+ 38
- 38
src/mwparserfromhell/parser/ctokenizer/tokens.h

@@ -26,44 +26,44 @@ SOFTWARE.

/* Token globals */

extern PyObject* Text;
extern PyObject* TemplateOpen;
extern PyObject* TemplateParamSeparator;
extern PyObject* TemplateParamEquals;
extern PyObject* TemplateClose;
extern PyObject* ArgumentOpen;
extern PyObject* ArgumentSeparator;
extern PyObject* ArgumentClose;
extern PyObject* WikilinkOpen;
extern PyObject* WikilinkSeparator;
extern PyObject* WikilinkClose;
extern PyObject* ExternalLinkOpen;
extern PyObject* ExternalLinkSeparator;
extern PyObject* ExternalLinkClose;
extern PyObject* HTMLEntityStart;
extern PyObject* HTMLEntityNumeric;
extern PyObject* HTMLEntityHex;
extern PyObject* HTMLEntityEnd;
extern PyObject* HeadingStart;
extern PyObject* HeadingEnd;
extern PyObject* CommentStart;
extern PyObject* CommentEnd;
extern PyObject* TagOpenOpen;
extern PyObject* TagAttrStart;
extern PyObject* TagAttrEquals;
extern PyObject* TagAttrQuote;
extern PyObject* TagCloseOpen;
extern PyObject* TagCloseSelfclose;
extern PyObject* TagOpenClose;
extern PyObject* TagCloseClose;
extern PyObject *Text;
extern PyObject *TemplateOpen;
extern PyObject *TemplateParamSeparator;
extern PyObject *TemplateParamEquals;
extern PyObject *TemplateClose;
extern PyObject *ArgumentOpen;
extern PyObject *ArgumentSeparator;
extern PyObject *ArgumentClose;
extern PyObject *WikilinkOpen;
extern PyObject *WikilinkSeparator;
extern PyObject *WikilinkClose;
extern PyObject *ExternalLinkOpen;
extern PyObject *ExternalLinkSeparator;
extern PyObject *ExternalLinkClose;
extern PyObject *HTMLEntityStart;
extern PyObject *HTMLEntityNumeric;
extern PyObject *HTMLEntityHex;
extern PyObject *HTMLEntityEnd;
extern PyObject *HeadingStart;
extern PyObject *HeadingEnd;
extern PyObject *CommentStart;
extern PyObject *CommentEnd;
extern PyObject *TagOpenOpen;
extern PyObject *TagAttrStart;
extern PyObject *TagAttrEquals;
extern PyObject *TagAttrQuote;
extern PyObject *TagCloseOpen;
extern PyObject *TagCloseSelfclose;
extern PyObject *TagOpenClose;
extern PyObject *TagCloseClose;

/* Functions */

void load_tokens_from_module(PyObject*);
void load_tokens_from_module(PyObject *);

+ 2
- 0
src/mwparserfromhell/parser/errors.py

@@ -20,6 +20,7 @@

__all__ = ["ParserError"]


class ParserError(Exception):
"""Exception raised when an internal error occurs while parsing.

@@ -28,6 +29,7 @@ class ParserError(Exception):
with an impossible internal state and is bailing out before other problems
can happen. Its appearance indicates a bug.
"""

def __init__(self, extra):
msg = "This is a bug and should be reported. Info: {}.".format(extra)
super().__init__(msg)

+ 166
- 99
src/mwparserfromhell/parser/tokenizer.py

@@ -1,4 +1,4 @@
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -24,11 +24,17 @@ import re

from . import contexts, tokens
from .errors import ParserError
from ..definitions import (get_html_tag, is_parsable, is_single,
is_single_only, is_scheme)
from ..definitions import (
get_html_tag,
is_parsable,
is_single,
is_single_only,
is_scheme,
)

__all__ = ["Tokenizer"]


class BadRoute(Exception):
"""Raised internally when the current tokenization route is invalid."""

@@ -39,14 +45,15 @@ class BadRoute(Exception):

class _TagOpenData:
"""Stores data about an HTML open tag, like ``<ref name="foo">``."""
CX_NAME = 1 << 0
CX_ATTR_READY = 1 << 1
CX_ATTR_NAME = 1 << 2
CX_ATTR_VALUE = 1 << 3
CX_QUOTED = 1 << 4
CX_NOTE_SPACE = 1 << 5

CX_NAME = 1 << 0
CX_ATTR_READY = 1 << 1
CX_ATTR_NAME = 1 << 2
CX_ATTR_VALUE = 1 << 3
CX_QUOTED = 1 << 4
CX_NOTE_SPACE = 1 << 5
CX_NOTE_EQUALS = 1 << 6
CX_NOTE_QUOTE = 1 << 7
CX_NOTE_QUOTE = 1 << 7

def __init__(self):
self.context = self.CX_NAME
@@ -57,11 +64,34 @@ class _TagOpenData:

class Tokenizer:
"""Creates a list of tokens from a string of wikicode."""

USES_C = False
START = object()
END = object()
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
":", "/", "-", "!", "\n", START, END]
MARKERS = [
"{",
"}",
"[",
"]",
"<",
">",
"|",
"=",
"&",
"'",
'"',
"#",
"*",
";",
":",
"/",
"-",
"!",
"\n",
START,
END,
]
URISCHEME = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
MAX_DEPTH = 40
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
tag_splitter = re.compile(r"([\s\"\'\\]+)")
@@ -323,7 +353,7 @@ class Tokenizer:
self._head += 2
try:
# If the wikilink looks like an external link, parse it as such:
link, _extra, _delta = self._really_parse_external_link(True)
link, _extra = self._really_parse_external_link(True)
except BadRoute:
self._head = reset + 1
try:
@@ -366,8 +396,7 @@ class Tokenizer:
self._emit_text("//")
self._head += 2
else:
valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
all_valid = lambda: all(char in valid for char in self._read())
all_valid = lambda: all(char in self.URISCHEME for char in self._read())
scheme = ""
while self._read() is not self.END and all_valid():
scheme += self._read()
@@ -386,17 +415,16 @@ class Tokenizer:

def _parse_free_uri_scheme(self):
"""Parse the URI scheme of a free (no brackets) external link."""
valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
scheme = []
try:
# We have to backtrack through the textbuffer looking for our
# scheme since it was just parsed as text:
for chunk in reversed(self._textbuffer):
for char in reversed(chunk):
# stop at the first non-word character
# Stop at the first non-word character
if re.fullmatch(r"\W", char):
raise StopIteration()
if char not in valid:
if char not in self.URISCHEME:
raise BadRoute()
scheme.append(char)
except StopIteration:
@@ -434,23 +462,26 @@ class Tokenizer:
self._emit_text(this)
return punct, tail

def _is_free_link_end(self, this, nxt):
"""Return whether the current head is the end of a free link."""
def _is_uri_end(self, this, nxt):
"""Return whether the current head is the end of a URI."""
# Built from _parse()'s end sentinels:
after, ctx = self._read(2), self._context
equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING
return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or
this == nxt == "'" or
(this == "|" and ctx & contexts.TEMPLATE) or
(this == "=" and ctx & equal_sign_contexts) or
(this == nxt == "}" and ctx & contexts.TEMPLATE) or
(this == nxt == after == "}" and ctx & contexts.ARGUMENT))
return (
this in (self.END, "\n", "[", "]", "<", ">", '"')
or " " in this
or this == nxt == "'"
or (this == "|" and ctx & contexts.TEMPLATE)
or (this == "=" and ctx & (contexts.TEMPLATE_PARAM_KEY | contexts.HEADING))
or (this == nxt == "}" and ctx & contexts.TEMPLATE)
or (this == nxt == after == "}" and ctx & contexts.ARGUMENT)
)

def _really_parse_external_link(self, brackets):
"""Really parse an external link."""
if brackets:
self._parse_bracketed_uri_scheme()
invalid = ("\n", " ", "]")
punct = ()
else:
self._parse_free_uri_scheme()
invalid = ("\n", " ", "[", "]")
@@ -465,53 +496,47 @@ class Tokenizer:
self._emit_text(tail)
tail = ""
self._parse_entity()
elif (this == "<" and nxt == "!" and self._read(2) ==
self._read(3) == "-"):
elif this == "<" and nxt == "!" and self._read(2) == self._read(3) == "-":
if tail:
self._emit_text(tail)
tail = ""
self._parse_comment()
elif not brackets and self._is_free_link_end(this, nxt):
return self._pop(), tail, -1
elif this is self.END or this == "\n":
self._fail_route()
elif this == nxt == "{" and self._can_recurse():
if tail:
self._emit_text(tail)
tail = ""
self._parse_template_or_argument()
elif this == "]":
return self._pop(), tail, 0
elif this == "'" and nxt == "'":
separator = tokens.ExternalLinkSeparator()
separator.suppress_space = True
self._emit(separator)
self._context ^= contexts.EXT_LINK_URI
self._context |= contexts.EXT_LINK_TITLE
return self._parse(push=False), None, 0
elif any(ch in this for ch in (" ", "\n", "[", "]", "<", ">",
"\"")):
before, after = re.split(r"[ \n[\]<>\"]", this, maxsplit=1)
delimiter = this[len(before)]
if brackets:
self._emit_text(before)
separator = tokens.ExternalLinkSeparator()
if delimiter != " ":
elif brackets:
if this is self.END or this == "\n":
self._fail_route()
if this == "]":
return self._pop(), None
if self._is_uri_end(this, nxt):
if " " in this:
before, after = this.split(" ", 1)
self._emit_text(before)
self._emit(tokens.ExternalLinkSeparator())
if after:
self._emit_text(after)
self._head += 1
else:
separator = tokens.ExternalLinkSeparator()
separator.suppress_space = True
self._emit(separator)
if after:
self._emit_text(after)
self._emit(separator)
self._context ^= contexts.EXT_LINK_URI
self._context |= contexts.EXT_LINK_TITLE
if delimiter == " ":
self._head += 1
return self._parse(push=False), None, 0
punct, tail = self._handle_free_link_text(punct, tail, before)
return self._pop(), tail + " " + after, 0
elif not brackets:
punct, tail = self._handle_free_link_text(punct, tail, this)
else:
return self._parse(push=False), None
self._emit_text(this)
else:
if self._is_uri_end(this, nxt):
if this is not self.END and " " in this:
before, after = this.split(" ", 1)
punct, tail = self._handle_free_link_text(punct, tail, before)
tail += " " + after
else:
self._head -= 1
return self._pop(), tail
punct, tail = self._handle_free_link_text(punct, tail, this)
self._head += 1

def _remove_uri_scheme_from_textbuffer(self, scheme):
@@ -536,7 +561,7 @@ class Tokenizer:
reset = self._head
self._head += 1
try:
link, extra, delta = self._really_parse_external_link(brackets)
link, extra = self._really_parse_external_link(brackets)
except BadRoute:
self._head = reset
if not brackets and self._context & contexts.DL_TERM:
@@ -550,7 +575,6 @@ class Tokenizer:
self._emit(tokens.ExternalLinkOpen(brackets=brackets))
self._emit_all(link)
self._emit(tokens.ExternalLinkClose())
self._head += delta
if extra:
self._emit_text(extra)

@@ -688,9 +712,13 @@ class Tokenizer:
self._emit_first(tokens.TagAttrQuote(char=data.quoter))
self._emit_all(self._pop())
buf = data.padding_buffer
self._emit_first(tokens.TagAttrStart(
pad_first=buf["first"], pad_before_eq=buf["before_eq"],
pad_after_eq=buf["after_eq"]))
self._emit_first(
tokens.TagAttrStart(
pad_first=buf["first"],
pad_before_eq=buf["before_eq"],
pad_after_eq=buf["after_eq"],
)
)
self._emit_all(self._pop())
for key in data.padding_buffer:
data.padding_buffer[key] = ""
@@ -698,7 +726,9 @@ class Tokenizer:
def _handle_tag_space(self, data, text):
"""Handle whitespace (*text*) inside of an HTML open tag."""
ctx = data.context
end_of_value = ctx & data.CX_ATTR_VALUE and not ctx & (data.CX_QUOTED | data.CX_NOTE_QUOTE)
end_of_value = ctx & data.CX_ATTR_VALUE and not ctx & (
data.CX_QUOTED | data.CX_NOTE_QUOTE
)
if end_of_value or (ctx & data.CX_QUOTED and ctx & data.CX_NOTE_SPACE):
self._push_tag_buffer(data)
data.context = data.CX_ATTR_READY
@@ -799,8 +829,10 @@ class Tokenizer:
"""Handle the ending of a closing tag (``</foo>``)."""
strip = lambda tok: tok.text.rstrip().lower()
closing = self._pop()
if len(closing) != 1 or (not isinstance(closing[0], tokens.Text) or
strip(closing[0]) != strip(self._stack[1])):
if len(closing) != 1 or (
not isinstance(closing[0], tokens.Text)
or strip(closing[0]) != strip(self._stack[1])
):
self._fail_route()
self._emit_all(closing)
self._emit(tokens.TagCloseClose())
@@ -815,8 +847,9 @@ class Tokenizer:
self._fail_route()
elif this == "<" and nxt == "/":
self._head += 3
if self._read() != ">" or (strip(self._read(-1)) !=
strip(self._stack[1].text)):
if self._read() != ">" or (
strip(self._read(-1)) != strip(self._stack[1].text)
):
self._head -= 1
self._emit_text("</")
continue
@@ -854,8 +887,8 @@ class Tokenizer:
depth -= 1
if depth == 0: # pragma: no cover (untestable/exceptional)
raise ParserError(
"_handle_single_tag_end() got an unexpected "
"TagCloseSelfclose")
"_handle_single_tag_end() got an unexpected TagCloseSelfclose"
)
else: # pragma: no cover (untestable/exceptional case)
raise ParserError("_handle_single_tag_end() missed a TagCloseOpen")
padding = stack[index].padding
@@ -869,8 +902,10 @@ class Tokenizer:
self._emit(tokens.TagOpenOpen())
while True:
this, nxt = self._read(), self._read(1)
can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or
data.context & data.CX_NOTE_SPACE)
can_exit = (
not data.context & (data.CX_QUOTED | data.CX_NAME)
or data.context & data.CX_NOTE_SPACE
)
if this is self.END:
if self._context & contexts.TAG_ATTR:
if data.context & data.CX_QUOTED:
@@ -1086,16 +1121,25 @@ class Tokenizer:
else:
self._emit_text("\n")

def _emit_table_tag(self, open_open_markup, tag, style, padding,
close_open_markup, contents, open_close_markup):
def _emit_table_tag(
self,
open_open_markup,
tag,
style,
padding,
close_open_markup,
contents,
open_close_markup,
):
"""Emit a table tag."""
self._emit(tokens.TagOpenOpen(wiki_markup=open_open_markup))
self._emit_text(tag)
if style:
self._emit_all(style)
if close_open_markup:
self._emit(tokens.TagCloseOpen(wiki_markup=close_open_markup,
padding=padding))
self._emit(
tokens.TagCloseOpen(wiki_markup=close_open_markup, padding=padding)
)
else:
self._emit(tokens.TagCloseOpen(padding=padding))
if contents:
@@ -1110,8 +1154,9 @@ class Tokenizer:
data.context = _TagOpenData.CX_ATTR_READY
while True:
this = self._read()
can_exit = (not data.context & data.CX_QUOTED or
data.context & data.CX_NOTE_SPACE)
can_exit = (
not data.context & data.CX_QUOTED or data.context & data.CX_NOTE_SPACE
)
if this == end_token and can_exit:
if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
self._push_tag_buffer(data)
@@ -1194,30 +1239,34 @@ class Tokenizer:
self._head -= 1
return

cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
line_context | contexts.TABLE_CELL_STYLE)
cell = self._parse(
contexts.TABLE_OPEN
| contexts.TABLE_CELL_OPEN
| line_context
| contexts.TABLE_CELL_STYLE
)
cell_context = self._context
self._context = old_context
reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
if reset_for_style:
self._head = reset
self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
line_context)
self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context)
padding = self._handle_table_style("|")
style = self._pop()
# Don't parse the style separator:
self._head += 1
cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
line_context)
cell = self._parse(
contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context
)
cell_context = self._context
self._context = old_context

close_open_markup = "|" if reset_for_style else None
self._emit_table_tag(markup, tag, style, padding, close_open_markup,
cell, "")
self._emit_table_tag(markup, tag, style, padding, close_open_markup, cell, "")
# Keep header/cell line contexts:
self._context |= cell_context & (contexts.TABLE_TH_LINE |
contexts.TABLE_TD_LINE)
self._context |= cell_context & (
contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE
)
# Offset displacement done by parse():
self._head -= 1

@@ -1340,7 +1389,11 @@ class Tokenizer:
elif this == "|" and self._context & contexts.TEMPLATE:
self._handle_template_param()
elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
if not self._global & contexts.GL_HEADING and self._read(-1) in ("\n", self.START) and nxt == "=":
if (
not self._global & contexts.GL_HEADING
and self._read(-1) in ("\n", self.START)
and nxt == "="
):
self._parse_heading()
else:
self._handle_template_param_value()
@@ -1369,7 +1422,11 @@ class Tokenizer:
self._parse_external_link(False)
elif this == "]" and self._context & contexts.EXT_LINK_TITLE:
return self._pop()
elif this == "=" and not self._global & contexts.GL_HEADING and not self._context & contexts.TEMPLATE:
elif (
this == "="
and not self._global & contexts.GL_HEADING
and not self._context & contexts.TEMPLATE
):
if self._read(-1) in ("\n", self.START):
self._parse_heading()
else:
@@ -1404,7 +1461,8 @@ class Tokenizer:
elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"):
self._handle_list()
elif self._read(-1) in ("\n", self.START) and (
this == nxt == self._read(2) == self._read(3) == "-"):
this == nxt == self._read(2) == self._read(3) == "-"
):
self._handle_hr()
elif this in ("\n", ":") and self._context & contexts.DL_TERM:
self._handle_dl_term()
@@ -1412,9 +1470,17 @@ class Tokenizer:
# Kill potential table contexts
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
# Start of table parsing
elif this == "{" and nxt == "|" and (
self._read(-1) in ("\n", self.START) or
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
elif (
this == "{"
and nxt == "|"
and (
self._read(-1) in ("\n", self.START)
or (
self._read(-2) in ("\n", self.START)
and self._read(-1).isspace()
)
)
):
if self._can_recurse():
self._parse_table()
else:
@@ -1438,8 +1504,9 @@ class Tokenizer:
elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS:
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
self._emit_text(this)
elif (self._read(-1) in ("\n", self.START) or
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
elif self._read(-1) in ("\n", self.START) or (
self._read(-2) in ("\n", self.START) and self._read(-1).isspace()
):
if this == "|" and nxt == "}":
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()


+ 30
- 28
src/mwparserfromhell/parser/tokens.py

@@ -28,6 +28,7 @@ the :class:`.Wikicode` tree by the :class:`.Builder`.

__all__ = ["Token"]


class Token(dict):
"""A token stores the semantic meaning of a unit of wikicode."""

@@ -61,43 +62,44 @@ def make(name):
__all__.append(name)
return type(name, (Token,), {})


Text = make("Text")

TemplateOpen = make("TemplateOpen") # {{
TemplateParamSeparator = make("TemplateParamSeparator") # |
TemplateParamEquals = make("TemplateParamEquals") # =
TemplateClose = make("TemplateClose") # }}
TemplateOpen = make("TemplateOpen") # {{
TemplateParamSeparator = make("TemplateParamSeparator") # |
TemplateParamEquals = make("TemplateParamEquals") # =
TemplateClose = make("TemplateClose") # }}

ArgumentOpen = make("ArgumentOpen") # {{{
ArgumentSeparator = make("ArgumentSeparator") # |
ArgumentClose = make("ArgumentClose") # }}}
ArgumentOpen = make("ArgumentOpen") # {{{
ArgumentSeparator = make("ArgumentSeparator") # |
ArgumentClose = make("ArgumentClose") # }}}

WikilinkOpen = make("WikilinkOpen") # [[
WikilinkSeparator = make("WikilinkSeparator") # |
WikilinkClose = make("WikilinkClose") # ]]
WikilinkOpen = make("WikilinkOpen") # [[
WikilinkSeparator = make("WikilinkSeparator") # |
WikilinkClose = make("WikilinkClose") # ]]

ExternalLinkOpen = make("ExternalLinkOpen") # [
ExternalLinkSeparator = make("ExternalLinkSeparator") #
ExternalLinkClose = make("ExternalLinkClose") # ]
ExternalLinkOpen = make("ExternalLinkOpen") # [
ExternalLinkSeparator = make("ExternalLinkSeparator") #
ExternalLinkClose = make("ExternalLinkClose") # ]

HTMLEntityStart = make("HTMLEntityStart") # &
HTMLEntityNumeric = make("HTMLEntityNumeric") # #
HTMLEntityHex = make("HTMLEntityHex") # x
HTMLEntityEnd = make("HTMLEntityEnd") # ;
HTMLEntityStart = make("HTMLEntityStart") # &
HTMLEntityNumeric = make("HTMLEntityNumeric") # #
HTMLEntityHex = make("HTMLEntityHex") # x
HTMLEntityEnd = make("HTMLEntityEnd") # ;

HeadingStart = make("HeadingStart") # =...
HeadingEnd = make("HeadingEnd") # =...
HeadingStart = make("HeadingStart") # =...
HeadingEnd = make("HeadingEnd") # =...

CommentStart = make("CommentStart") # <!--
CommentEnd = make("CommentEnd") # -->
CommentStart = make("CommentStart") # <!--
CommentEnd = make("CommentEnd") # -->

TagOpenOpen = make("TagOpenOpen") # <
TagOpenOpen = make("TagOpenOpen") # <
TagAttrStart = make("TagAttrStart")
TagAttrEquals = make("TagAttrEquals") # =
TagAttrQuote = make("TagAttrQuote") # ", '
TagCloseOpen = make("TagCloseOpen") # >
TagCloseSelfclose = make("TagCloseSelfclose") # />
TagOpenClose = make("TagOpenClose") # </
TagCloseClose = make("TagCloseClose") # >
TagAttrEquals = make("TagAttrEquals") # =
TagAttrQuote = make("TagAttrQuote") # ", '
TagCloseOpen = make("TagCloseOpen") # >
TagCloseSelfclose = make("TagCloseSelfclose") # />
TagOpenClose = make("TagOpenClose") # </
TagCloseClose = make("TagCloseClose") # >

del make
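
Since each call to make() produces a thin dict subclass, token fields are just mapping keys with attribute-style access. A minimal sketch, assuming Token's dict-backed __getattr__/__setattr__:

    from mwparserfromhell.parser import tokens

    tok = tokens.Text(text="foo")
    assert tok.text == "foo"  # attribute reads go through the dict
    tok.text = "bar"          # attribute writes update it in place
    assert tok.text == "bar"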

+ 4
- 4
src/mwparserfromhell/smart_list/list_proxy.py

@@ -167,7 +167,7 @@ class ListProxy(_SliceNormalizerMixIn, list):

def _render(self):
"""Return the actual list from the stored start/stop/step."""
return list(self._parent)[self._start:self._stop:self._step]
return list(self._parent)[self._start : self._stop : self._step]

@inheritdoc
def append(self, item):
@@ -187,7 +187,7 @@ class ListProxy(_SliceNormalizerMixIn, list):

@inheritdoc
def extend(self, item):
self._parent[self._stop:self._stop] = item
self._parent[self._stop : self._stop] = item

@inheritdoc
def insert(self, index, item):
@@ -215,7 +215,7 @@ class ListProxy(_SliceNormalizerMixIn, list):
def reverse(self):
item = self._render()
item.reverse()
self._parent[self._start:self._stop:self._step] = item
self._parent[self._start : self._stop : self._step] = item

@inheritdoc
def sort(self, key=None, reverse=None):
@@ -226,4 +226,4 @@ class ListProxy(_SliceNormalizerMixIn, list):
if reverse is not None:
kwargs["reverse"] = reverse
item.sort(**kwargs)
self._parent[self._start:self._stop:self._step] = item
self._parent[self._start : self._stop : self._step] = item
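
These slice assignments are what keep a ListProxy and its parent SmartList in sync; Black only changes the spacing around the slice colons. The behavior, as test_doctest below exercises it:

    from mwparserfromhell.smart_list import SmartList

    parent = SmartList([0, 1, 2, 3])
    child = parent[1:]  # a ListProxy that shares the parent's data
    child.append(4)     # lands in the parent via slice assignment
    assert child == [1, 2, 3, 4]
    assert parent == [0, 1, 2, 3, 4]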

+ 5
- 2
src/mwparserfromhell/string_mixin.py

@@ -27,6 +27,7 @@ from sys import getdefaultencoding

__all__ = ["StringMixIn"]


def inheritdoc(method):
"""Set __doc__ of *method* to __doc__ of *method* in its parent class.

@@ -36,6 +37,7 @@ def inheritdoc(method):
method.__doc__ = getattr(str, method.__name__).__doc__
return method


class StringMixIn:
"""Implement the interface for ``str`` in a dynamic manner.

@@ -92,8 +94,9 @@ class StringMixIn:

def __getattr__(self, attr):
if not hasattr(str, attr):
raise AttributeError("{!r} object has no attribute {!r}".format(
type(self).__name__, attr))
raise AttributeError(
"{!r} object has no attribute {!r}".format(type(self).__name__, attr)
)
return getattr(self.__str__(), attr)

maketrans = str.maketrans # Static method can't rely on __getattr__
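
Apart from maketrans, every str method is resolved dynamically: __getattr__ forwards unknown attributes to the rendered string. A minimal subclass (mirroring _FakeString in the test suite below) only needs __str__:

    from mwparserfromhell.string_mixin import StringMixIn

    class FakeString(StringMixIn):
        def __init__(self, data):
            self._data = data

        def __str__(self):
            return self._data

    s = FakeString("fake string")
    assert s.upper() == "FAKE STRING"  # delegated to str via __getattr__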


+ 5
- 2
src/mwparserfromhell/utils.py

@@ -25,6 +25,7 @@ users generally won't need stuff from here.

__all__ = ["parse_anything"]


def parse_anything(value, context=0, skip_style_tags=False):
"""Return a :class:`.Wikicode` for *value*, allowing multiple types.

@@ -64,6 +65,8 @@ def parse_anything(value, context=0, skip_style_tags=False):
nodelist += parse_anything(item, context, skip_style_tags).nodes
return Wikicode(nodelist)
except TypeError as exc:
error = ("Needs string, Node, Wikicode, file, int, None, or "
"iterable of these, but got {0}: {1}")
error = (
"Needs string, Node, Wikicode, file, int, None, or "
"iterable of these, but got {0}: {1}"
)
raise ValueError(error.format(type(value).__name__, value)) from exc
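
The reflowed message enumerates everything parse_anything accepts. Roughly, assuming the documented conversions (ints are parsed as their string form, iterables are concatenated, None yields an empty Wikicode):

    from mwparserfromhell.utils import parse_anything

    parse_anything("{{foo}}")       # str -> parsed Wikicode
    parse_anything(42)              # int -> parsed as "42"
    parse_anything(["{{a}}", "b"])  # iterable -> nodes concatenated
    parse_anything(None)            # None -> empty Wikicode
    # anything else, e.g. parse_anything(object()), raises ValueError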

+ 57
- 22
src/mwparserfromhell/wikicode.py

@@ -21,8 +21,18 @@
import re
from itertools import chain

from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity,
Node, Tag, Template, Text, Wikilink)
from .nodes import (
Argument,
Comment,
ExternalLink,
Heading,
HTMLEntity,
Node,
Tag,
Template,
Text,
Wikilink,
)
from .smart_list.list_proxy import ListProxy
from .string_mixin import StringMixIn
from .utils import parse_anything
@@ -31,6 +41,7 @@ __all__ = ["Wikicode"]

FLAGS = re.IGNORECASE | re.DOTALL | re.UNICODE


class Wikicode(StringMixIn):
"""A ``Wikicode`` is a container for nodes that operates like a string.

@@ -41,6 +52,7 @@ class Wikicode(StringMixIn):
<ifilter>` series of functions is very useful for extracting and iterating
over, for example, all of the templates in the object.
"""

RECURSE_OTHERS = 2

def __init__(self, nodes):
@@ -82,8 +94,9 @@ class Wikicode(StringMixIn):
return lambda obj: re.search(matches, str(obj), flags)
return lambda obj: True

def _indexed_ifilter(self, recursive=True, matches=None, flags=FLAGS,
forcetype=None):
def _indexed_ifilter(
self, recursive=True, matches=None, flags=FLAGS, forcetype=None
):
"""Iterate over nodes and their corresponding indices in the node list.

The arguments are interpreted as for :meth:`ifilter`. For each tuple
@@ -94,9 +107,11 @@ class Wikicode(StringMixIn):
match = self._build_matcher(matches, flags)
if recursive:
restrict = forcetype if recursive == self.RECURSE_OTHERS else None

def getter(i, node):
for ch in self._get_children(node, restrict=restrict):
yield (i, ch)

inodes = chain(*(getter(i, n) for i, n in enumerate(self.nodes)))
else:
inodes = enumerate(self.nodes)
@@ -106,6 +121,7 @@ class Wikicode(StringMixIn):

def _is_child_wikicode(self, obj, recursive=True):
"""Return whether the given :class:`.Wikicode` is a descendant."""

def deref(nodes):
if isinstance(nodes, ListProxy):
return nodes._parent # pylint: disable=protected-access
@@ -210,6 +226,7 @@ class Wikicode(StringMixIn):
should be any object that can be tested for with ``is``. *indent* is
the starting indentation.
"""

def write(*args):
"""Write a new line following the proper indentation rules."""
if lines and lines[-1] is marker: # Continue from the last line
@@ -243,10 +260,12 @@ class Wikicode(StringMixIn):
This is equivalent to :meth:`{1}` with *forcetype* set to
:class:`~{2.__module__}.{2.__name__}`.
"""
make_ifilter = lambda ftype: (lambda self, *a, **kw:
self.ifilter(forcetype=ftype, *a, **kw))
make_filter = lambda ftype: (lambda self, *a, **kw:
self.filter(forcetype=ftype, *a, **kw))
make_ifilter = lambda ftype: (
lambda self, *a, **kw: self.ifilter(forcetype=ftype, *a, **kw)
)
make_filter = lambda ftype: (
lambda self, *a, **kw: self.filter(forcetype=ftype, *a, **kw)
)
for name, ftype in meths.items():
ifilt = make_ifilter(ftype)
filt = make_filter(ftype)
@@ -342,6 +361,7 @@ class Wikicode(StringMixIn):
Will return an empty list if *obj* is at the top level of this Wikicode
object. Will raise :exc:`ValueError` if it wasn't found.
"""

def _get_ancestors(code, needle):
for node in code.nodes:
if node is needle:
@@ -510,8 +530,7 @@ class Wikicode(StringMixIn):
return True
return False

def ifilter(self, recursive=True, matches=None, flags=FLAGS,
forcetype=None):
def ifilter(self, recursive=True, matches=None, flags=FLAGS, forcetype=None):
"""Iterate over nodes in our list matching certain conditions.

If *forcetype* is given, only nodes that are instances of this type (or
@@ -545,8 +564,15 @@ class Wikicode(StringMixIn):
"""
return list(self.ifilter(*args, **kwargs))

def get_sections(self, levels=None, matches=None, flags=FLAGS, flat=False,
include_lead=None, include_headings=True):
def get_sections(
self,
levels=None,
matches=None,
flags=FLAGS,
flat=False,
include_lead=None,
include_headings=True,
):
"""Return a list of sections within the page.

Sections are returned as :class:`.Wikicode` objects with a shared node
@@ -568,12 +594,14 @@ class Wikicode(StringMixIn):
:class:`.Heading` object will be included; otherwise, this is skipped.
"""
title_matcher = self._build_matcher(matches, flags)
matcher = lambda heading: (title_matcher(heading.title) and
(not levels or heading.level in levels))
matcher = lambda heading: (
title_matcher(heading.title) and (not levels or heading.level in levels)
)
iheadings = self._indexed_ifilter(recursive=False, forcetype=Heading)
sections = [] # Tuples of (index_of_first_node, section)
open_headings = [] # Tuples of (index, heading), where index and
# heading.level are both monotonically increasing
# Tuples of (index, heading), where index and heading.level are both
# monotonically increasing
open_headings = []

# Add the lead section if appropriate:
if include_lead or not (include_lead is not None or matches or levels):
@@ -610,8 +638,7 @@ class Wikicode(StringMixIn):
# Ensure that earlier sections are earlier in the returned list:
return [section for i, section in sorted(sections)]

def strip_code(self, normalize=True, collapse=True,
keep_template_params=False):
def strip_code(self, normalize=True, collapse=True, keep_template_params=False):
"""Return a rendered string without unprintable code such as templates.

The way a node is stripped is handled by the
@@ -631,7 +658,7 @@ class Wikicode(StringMixIn):
kwargs = {
"normalize": normalize,
"collapse": collapse,
"keep_template_params": keep_template_params
"keep_template_params": keep_template_params,
}

nodes = []
@@ -673,7 +700,15 @@ class Wikicode(StringMixIn):
marker = object() # Random object we can find with certainty in a list
return "\n".join(self._get_tree(self, [], marker, 0))


Wikicode._build_filter_methods(
arguments=Argument, comments=Comment, external_links=ExternalLink,
headings=Heading, html_entities=HTMLEntity, tags=Tag, templates=Template,
text=Text, wikilinks=Wikilink)
arguments=Argument,
comments=Comment,
external_links=ExternalLink,
headings=Heading,
html_entities=HTMLEntity,
tags=Tag,
templates=Template,
text=Text,
wikilinks=Wikilink,
)
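
This mapping is what _build_filter_methods expands into the public filter_*/ifilter_* pairs, so the reformatting here is cosmetic. For instance:

    import mwparserfromhell

    code = mwparserfromhell.parse("a {{b}} and [[c]]")
    assert [str(t) for t in code.filter_templates()] == ["{{b}}"]
    assert [str(w) for w in code.filter_wikilinks()] == ["[[c]]"]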

+ 22
- 2
tests/conftest.py

@@ -18,14 +18,24 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading,
HTMLEntity, Tag, Template, Text, Wikilink)
from mwparserfromhell.nodes import (
Argument,
Comment,
ExternalLink,
Heading,
HTMLEntity,
Tag,
Template,
Text,
Wikilink,
)
from mwparserfromhell.smart_list import SmartList
from mwparserfromhell.wikicode import Wikicode

wrap = lambda L: Wikicode(SmartList(L))
wraptext = lambda *args: wrap([Text(t) for t in args])


def _assert_node_equal(expected, actual):
"""Assert that two Nodes have the same type and have the same data."""
registry = {
@@ -43,6 +53,7 @@ def _assert_node_equal(expected, actual):
assert type(expected) == type(actual)
registry[type(expected)](expected, actual)


def _assert_argument_node_equal(expected, actual):
"""Assert that two Argument nodes have the same data."""
assert_wikicode_equal(expected.name, actual.name)
@@ -51,10 +62,12 @@ def _assert_argument_node_equal(expected, actual):
else:
assert actual.default is None


def _assert_comment_node_equal(expected, actual):
"""Assert that two Comment nodes have the same data."""
assert expected.contents == actual.contents


def _assert_external_link_node_equal(expected, actual):
"""Assert that two ExternalLink nodes have the same data."""
assert_wikicode_equal(expected.url, actual.url)
@@ -65,11 +78,13 @@ def _assert_external_link_node_equal(expected, actual):
assert expected.brackets is actual.brackets
assert expected.suppress_space is actual.suppress_space


def _assert_heading_node_equal(expected, actual):
"""Assert that two Heading nodes have the same data."""
assert_wikicode_equal(expected.title, actual.title)
assert expected.level == actual.level


def _assert_html_entity_node_equal(expected, actual):
"""Assert that two HTMLEntity nodes have the same data."""
assert expected.value == actual.value
@@ -77,6 +92,7 @@ def _assert_html_entity_node_equal(expected, actual):
assert expected.hexadecimal is actual.hexadecimal
assert expected.hex_char == actual.hex_char


def _assert_tag_node_equal(expected, actual):
"""Assert that two Tag nodes have the same data."""
assert_wikicode_equal(expected.tag, actual.tag)
@@ -105,6 +121,7 @@ def _assert_tag_node_equal(expected, actual):
assert expected.padding == actual.padding
assert_wikicode_equal(expected.closing_tag, actual.closing_tag)


def _assert_template_node_equal(expected, actual):
"""Assert that two Template nodes have the same data."""
assert_wikicode_equal(expected.name, actual.name)
@@ -117,10 +134,12 @@ def _assert_template_node_equal(expected, actual):
assert_wikicode_equal(exp_param.value, act_param.value)
assert exp_param.showkey is act_param.showkey


def _assert_text_node_equal(expected, actual):
"""Assert that two Text nodes have the same data."""
assert expected.value == actual.value


def _assert_wikilink_node_equal(expected, actual):
"""Assert that two Wikilink nodes have the same data."""
assert_wikicode_equal(expected.title, actual.title)
@@ -129,6 +148,7 @@ def _assert_wikilink_node_equal(expected, actual):
else:
assert actual.text is None


def assert_wikicode_equal(expected, actual):
"""Assert that two Wikicode objects have the same data."""
assert isinstance(actual, Wikicode)
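
The wrap/wraptext lambdas near the top of this file build most expected values in the suite:

    expected = wraptext("foo", "bar")  # Wikicode([Text("foo"), Text("bar")])
    assert_wikicode_equal(expected, wrap([Text("foo"), Text("bar")]))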


+ 16
- 2
tests/test_argument.py

@@ -27,6 +27,7 @@ import pytest
from mwparserfromhell.nodes import Argument, Text
from .conftest import assert_wikicode_equal, wrap, wraptext


def test_str():
"""test Argument.__str__()"""
node = Argument(wraptext("foobar"))
@@ -34,6 +35,7 @@ def test_str():
node2 = Argument(wraptext("foo"), wraptext("bar"))
assert "{{{foo|bar}}}" == str(node2)


def test_children():
"""test Argument.__children__()"""
node1 = Argument(wraptext("foobar"))
@@ -48,6 +50,7 @@ def test_children():
with pytest.raises(StopIteration):
next(gen2)


def test_strip():
"""test Argument.__strip__()"""
node1 = Argument(wraptext("foobar"))
@@ -55,6 +58,7 @@ def test_strip():
assert node1.__strip__() is None
assert "bar" == node2.__strip__()


def test_showtree():
"""test Argument.__showtree__()"""
output = []
@@ -66,10 +70,19 @@ def test_showtree():
node1.__showtree__(output.append, get, mark)
node2.__showtree__(output.append, get, mark)
valid = [
"{{{", (getter, node1.name), "}}}", "{{{", (getter, node2.name),
" | ", marker, (getter, node2.default), "}}}"]
"{{{",
(getter, node1.name),
"}}}",
"{{{",
(getter, node2.name),
" | ",
marker,
(getter, node2.default),
"}}}",
]
assert valid == output


def test_name():
"""test getter/setter for the name attribute"""
name = wraptext("foobar")
@@ -82,6 +95,7 @@ def test_name():
assert_wikicode_equal(wraptext("héhehé"), node1.name)
assert_wikicode_equal(wraptext("héhehé"), node2.name)


def test_default():
"""test getter/setter for the default attribute"""
default = wraptext("baz")


+ 5
- 0
tests/test_attribute.py

@@ -28,6 +28,7 @@ from mwparserfromhell.nodes import Template
from mwparserfromhell.nodes.extras import Attribute
from .conftest import assert_wikicode_equal, wrap, wraptext


def test_str():
"""test Attribute.__str__()"""
node = Attribute(wraptext("foo"))
@@ -43,6 +44,7 @@ def test_str():
node6 = Attribute(wraptext("a"), wrap([]), None, " ", "", " ")
assert " a= " == str(node6)


def test_name():
"""test getter/setter for the name attribute"""
name = wraptext("id")
@@ -51,6 +53,7 @@ def test_name():
node.name = "{{id}}"
assert_wikicode_equal(wrap([Template(wraptext("id"))]), node.name)


def test_value():
"""test getter/setter for the value attribute"""
value = wraptext("foo")
@@ -74,6 +77,7 @@ def test_value():
assert_wikicode_equal(wraptext("fo\"o 'bar' b\"az"), node2.value)
assert '"' == node2.quotes


def test_quotes():
"""test getter/setter for the quotes attribute"""
node1 = Attribute(wraptext("id"), wraptext("foo"), None)
@@ -92,6 +96,7 @@ def test_quotes():
with pytest.raises(ValueError):
Attribute(wraptext("id"), wraptext("foo bar baz"), None)


def test_padding():
"""test getter/setter for the padding attributes"""
for pad in ["pad_first", "pad_before_eq", "pad_after_eq"]:


+ 737
- 326
tests/test_builder.py
File diff suppressed because it is too large


+ 5
- 0
tests/test_comment.py

@@ -26,11 +26,13 @@ import pytest

from mwparserfromhell.nodes import Comment


def test_str():
"""test Comment.__str__()"""
node = Comment("foobar")
assert "<!--foobar-->" == str(node)


def test_children():
"""test Comment.__children__()"""
node = Comment("foobar")
@@ -38,11 +40,13 @@ def test_children():
with pytest.raises(StopIteration):
next(gen)


def test_strip():
"""test Comment.__strip__()"""
node = Comment("foobar")
assert node.__strip__() is None


def test_showtree():
"""test Comment.__showtree__()"""
output = []
@@ -50,6 +54,7 @@ def test_showtree():
node.__showtree__(output.append, None, None)
assert ["<!--foobar-->"] == output


def test_contents():
"""test getter/setter for the contents attribute"""
node = Comment("foobar")


+ 12
- 6
tests/test_docs.py

@@ -32,6 +32,7 @@ import pytest

import mwparserfromhell


def assert_print(value, output):
"""Assertion check that *value*, when printed, produces *output*."""
buff = StringIO()
@@ -39,6 +40,7 @@ def assert_print(value, output):
buff.seek(0)
assert output == buff.read()


def test_readme_1():
"""test a block of example code in the README"""
text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?"
@@ -52,6 +54,7 @@ def test_readme_1():
assert_print(template.get(1).value, "bar")
assert_print(template.get("eggs").value, "spam")


def test_readme_2():
"""test a block of example code in the README"""
text = "{{foo|{{bar}}={{baz|{{spam}}}}}}"
@@ -59,17 +62,19 @@ def test_readme_2():
res = "['{{foo|{{bar}}={{baz|{{spam}}}}}}', '{{bar}}', '{{baz|{{spam}}}}', '{{spam}}']"
assert_print(temps, res)


def test_readme_3():
"""test a block of example code in the README"""
code = mwparserfromhell.parse("{{foo|this {{includes a|template}}}}")
assert_print(code.filter_templates(recursive=False),
"['{{foo|this {{includes a|template}}}}']")
assert_print(
code.filter_templates(recursive=False),
"['{{foo|this {{includes a|template}}}}']",
)
foo = code.filter_templates(recursive=False)[0]
assert_print(foo.get(1).value, "this {{includes a|template}}")
assert_print(foo.get(1).value.filter_templates()[0],
"{{includes a|template}}")
assert_print(foo.get(1).value.filter_templates()[0].get(1).value,
"template")
assert_print(foo.get(1).value.filter_templates()[0], "{{includes a|template}}")
assert_print(foo.get(1).value.filter_templates()[0].get(1).value, "template")


def test_readme_4():
"""test a block of example code in the README"""
@@ -90,6 +95,7 @@ def test_readme_4():
assert_print(text, res)
assert text == code


@pytest.mark.skipif("NOWEB" in os.environ, reason="web test disabled by environ var")
def test_readme_5():
"""test a block of example code in the README; includes a web call"""


+ 12
- 7
tests/test_external_link.py

@@ -27,6 +27,7 @@ import pytest
from mwparserfromhell.nodes import ExternalLink, Text
from .conftest import assert_wikicode_equal, wrap, wraptext


def test_str():
"""test ExternalLink.__str__()"""
node = ExternalLink(wraptext("http://example.com/"), brackets=False)
@@ -35,15 +36,16 @@ def test_str():
assert "[http://example.com/]" == str(node2)
node3 = ExternalLink(wraptext("http://example.com/"), wrap([]))
assert "[http://example.com/ ]" == str(node3)
node4 = ExternalLink(wraptext("http://example.com/"),
wraptext("Example Web Page"))
node4 = ExternalLink(wraptext("http://example.com/"), wraptext("Example Web Page"))
assert "[http://example.com/ Example Web Page]" == str(node4)


def test_children():
"""test ExternalLink.__children__()"""
node1 = ExternalLink(wraptext("http://example.com/"), brackets=False)
node2 = ExternalLink(wraptext("http://example.com/"),
wrap([Text("Example"), Text("Page")]))
node2 = ExternalLink(
wraptext("http://example.com/"), wrap([Text("Example"), Text("Page")])
)
gen1 = node1.__children__()
gen2 = node2.__children__()
assert node1.url == next(gen1)
@@ -54,6 +56,7 @@ def test_children():
with pytest.raises(StopIteration):
next(gen2)


def test_strip():
"""test ExternalLink.__strip__()"""
node1 = ExternalLink(wraptext("http://example.com"), brackets=False)
@@ -66,6 +69,7 @@ def test_strip():
assert node3.__strip__() is None
assert "Link" == node4.__strip__()


def test_showtree():
"""test ExternalLink.__showtree__()"""
output = []
@@ -76,11 +80,10 @@ def test_showtree():
node2 = ExternalLink(wraptext("http://example.com"), wraptext("Link"))
node1.__showtree__(output.append, get, mark)
node2.__showtree__(output.append, get, mark)
valid = [
(getter, node1.url), "[", (getter, node2.url),
(getter, node2.title), "]"]
valid = [(getter, node1.url), "[", (getter, node2.url), (getter, node2.title), "]"]
assert valid == output


def test_url():
"""test getter/setter for the url attribute"""
url = wraptext("http://example.com/")
@@ -93,6 +96,7 @@ def test_url():
assert_wikicode_equal(wraptext("mailto:héhehé@spam.com"), node1.url)
assert_wikicode_equal(wraptext("mailto:héhehé@spam.com"), node2.url)


def test_title():
"""test getter/setter for the title attribute"""
title = wraptext("Example!")
@@ -105,6 +109,7 @@ def test_title():
node2.title = "My Website"
assert_wikicode_equal(wraptext("My Website"), node2.title)


def test_brackets():
"""test getter/setter for the brackets attribute"""
node1 = ExternalLink(wraptext("http://example.com/"), brackets=False)


+ 7
- 2
tests/test_heading.py

@@ -27,6 +27,7 @@ import pytest
from mwparserfromhell.nodes import Heading, Text
from .conftest import assert_wikicode_equal, wrap, wraptext


def test_str():
"""test Heading.__str__()"""
node = Heading(wraptext("foobar"), 2)
@@ -34,6 +35,7 @@ def test_str():
node2 = Heading(wraptext(" zzz "), 5)
assert "===== zzz =====" == str(node2)


def test_children():
"""test Heading.__children__()"""
node = Heading(wrap([Text("foo"), Text("bar")]), 3)
@@ -42,11 +44,13 @@ def test_children():
with pytest.raises(StopIteration):
next(gen)


def test_strip():
"""test Heading.__strip__()"""
node = Heading(wraptext("foobar"), 3)
assert "foobar" == node.__strip__()


def test_showtree():
"""test Heading.__showtree__()"""
output = []
@@ -56,10 +60,10 @@ def test_showtree():
node2 = Heading(wraptext(" baz "), 4)
node1.__showtree__(output.append, get, None)
node2.__showtree__(output.append, get, None)
valid = ["===", (getter, node1.title), "===",
"====", (getter, node2.title), "===="]
valid = ["===", (getter, node1.title), "===", "====", (getter, node2.title), "===="]
assert valid == output


def test_title():
"""test getter/setter for the title attribute"""
title = wraptext("foobar")
@@ -68,6 +72,7 @@ def test_title():
node.title = "héhehé"
assert_wikicode_equal(wraptext("héhehé"), node.title)


def test_level():
"""test getter/setter for the level attribute"""
node = Heading(wraptext("foobar"), 3)


+ 9
- 0
tests/test_html_entity.py

@@ -26,6 +26,7 @@ import pytest

from mwparserfromhell.nodes import HTMLEntity


def test_str():
"""test HTMLEntity.__str__()"""
node1 = HTMLEntity("nbsp", named=True, hexadecimal=False)
@@ -37,6 +38,7 @@ def test_str():
assert "&#x6b;" == str(node3)
assert "&#X6C;" == str(node4)


def test_children():
"""test HTMLEntity.__children__()"""
node = HTMLEntity("nbsp", named=True, hexadecimal=False)
@@ -44,6 +46,7 @@ def test_children():
with pytest.raises(StopIteration):
next(gen)


def test_strip():
"""test HTMLEntity.__strip__()"""
node1 = HTMLEntity("nbsp", named=True, hexadecimal=False)
@@ -57,6 +60,7 @@ def test_strip():
assert "é" == node3.__strip__(normalize=True)
assert "&#xe9;" == node3.__strip__(normalize=False)


def test_showtree():
"""test HTMLEntity.__showtree__()"""
output = []
@@ -69,6 +73,7 @@ def test_showtree():
res = ["&nbsp;", "&#107;", "&#xe9;"]
assert res == output


def test_value():
"""test getter/setter for the value attribute"""
node1 = HTMLEntity("nbsp")
@@ -109,6 +114,7 @@ def test_value():
with pytest.raises(ValueError):
node1.__setattr__("value", "12FFFF")


def test_named():
"""test getter/setter for the named attribute"""
node1 = HTMLEntity("nbsp")
@@ -130,6 +136,7 @@ def test_named():
with pytest.raises(ValueError):
node3.__setattr__("named", True)


def test_hexadecimal():
"""test getter/setter for the hexadecimal attribute"""
node1 = HTMLEntity("nbsp")
@@ -147,6 +154,7 @@ def test_hexadecimal():
with pytest.raises(ValueError):
node1.__setattr__("hexadecimal", True)


def test_hex_char():
"""test getter/setter for the hex_char attribute"""
node1 = HTMLEntity("e9")
@@ -164,6 +172,7 @@ def test_hex_char():
with pytest.raises(ValueError):
node1.__setattr__("hex_char", True)


def test_normalize():
"""test getter/setter for the normalize attribute"""
node1 = HTMLEntity("nbsp")


+ 4
- 0
tests/test_parameter.py

@@ -27,6 +27,7 @@ import pytest
from mwparserfromhell.nodes.extras import Parameter
from .conftest import assert_wikicode_equal, wraptext


def test_str():
"""test Parameter.__str__()"""
node = Parameter(wraptext("1"), wraptext("foo"), showkey=False)
@@ -34,6 +35,7 @@ def test_str():
node2 = Parameter(wraptext("foo"), wraptext("bar"))
assert "foo=bar" == str(node2)


def test_name():
"""test getter/setter for the name attribute"""
name1 = wraptext("1")
@@ -47,6 +49,7 @@ def test_name():
assert_wikicode_equal(wraptext("héhehé"), node1.name)
assert_wikicode_equal(wraptext("héhehé"), node2.name)


def test_value():
"""test getter/setter for the value attribute"""
value = wraptext("bar")
@@ -55,6 +58,7 @@ def test_value():
node.value = "héhehé"
assert_wikicode_equal(wraptext("héhehé"), node.value)


def test_showkey():
"""test getter/setter for the showkey attribute"""
node1 = Parameter(wraptext("1"), wraptext("foo"), showkey=False)


+ 43
- 19
tests/test_parser.py

@@ -29,6 +29,7 @@ from mwparserfromhell.nodes import Tag, Template, Text, Wikilink
from mwparserfromhell.nodes.extras import Parameter
from .conftest import assert_wikicode_equal, wrap, wraptext


@pytest.fixture()
def pyparser():
"""make sure the correct tokenizer is used"""
@@ -38,37 +39,60 @@ def pyparser():
yield
parser.use_c = restore


def test_use_c(pyparser):
assert parser.Parser()._tokenizer.USES_C is False


def test_parsing(pyparser):
"""integration test for parsing overall"""
text = "this is text; {{this|is=a|template={{with|[[links]]|in}}it}}"
expected = wrap([
Text("this is text; "),
Template(wraptext("this"), [
Parameter(wraptext("is"), wraptext("a")),
Parameter(wraptext("template"), wrap([
Template(wraptext("with"), [
Parameter(wraptext("1"),
wrap([Wikilink(wraptext("links"))]),
showkey=False),
Parameter(wraptext("2"),
wraptext("in"), showkey=False)
]),
Text("it")
]))
])
])
expected = wrap(
[
Text("this is text; "),
Template(
wraptext("this"),
[
Parameter(wraptext("is"), wraptext("a")),
Parameter(
wraptext("template"),
wrap(
[
Template(
wraptext("with"),
[
Parameter(
wraptext("1"),
wrap([Wikilink(wraptext("links"))]),
showkey=False,
),
Parameter(
wraptext("2"), wraptext("in"), showkey=False
),
],
),
Text("it"),
]
),
),
],
),
]
)
actual = parser.Parser().parse(text)
assert_wikicode_equal(expected, actual)


def test_skip_style_tags(pyparser):
"""test Parser.parse(skip_style_tags=True)"""
text = "This is an example with ''italics''!"
a = wrap([Text("This is an example with "),
Tag(wraptext("i"), wraptext("italics"), wiki_markup="''"),
Text("!")])
a = wrap(
[
Text("This is an example with "),
Tag(wraptext("i"), wraptext("italics"), wiki_markup="''"),
Text("!"),
]
)
b = wraptext("This is an example with ''italics''!")

with_style = parser.Parser().parse(text, skip_style_tags=False)


+ 27
- 2
tests/test_smart_list.py

@@ -27,6 +27,7 @@ import pytest
from mwparserfromhell.smart_list import SmartList
from mwparserfromhell.smart_list.list_proxy import ListProxy


def _test_get_set_del_item(builder):
"""Run tests on __get/set/delitem__ of a list built with *builder*."""
list1 = builder([0, 1, 2, 3, "one", "two"])
@@ -104,6 +105,7 @@ def _test_get_set_del_item(builder):
del list2[2:8:2]
assert [0, 1, 3, 5, 7, 8, 9] == list2


def _test_add_radd_iadd(builder):
"""Run tests on __r/i/add__ of a list built with *builder*."""
list1 = builder(range(5))
@@ -116,6 +118,7 @@ def _test_add_radd_iadd(builder):
list1 += ["foo", "bar", "baz"]
assert [0, 1, 2, 3, 4, "foo", "bar", "baz"] == list1


def _test_other_magic_methods(builder):
"""Run tests on other magic methods of a list built with *builder*."""
list1 = builder([0, 1, 2, 3, "one", "two"])
@@ -200,6 +203,7 @@ def _test_other_magic_methods(builder):
list4 *= 2
assert [0, 1, 2, 0, 1, 2] == list4


def _test_list_methods(builder):
"""Run tests on the public methods of a list built with *builder*."""
list1 = builder(range(5))
@@ -263,6 +267,7 @@ def _test_list_methods(builder):
list3.sort(key=lambda i: i[1], reverse=True)
assert [("b", 8), ("a", 5), ("c", 3), ("d", 2)] == list3


def _dispatch_test_for_children(meth):
"""Run a test method on various different types of children."""
meth(lambda L: SmartList(list(L))[:])
@@ -270,10 +275,20 @@ def _dispatch_test_for_children(meth):
meth(lambda L: SmartList(list(L) + [999])[:-1])
meth(lambda L: SmartList([101, 102] + list(L) + [201, 202])[2:-2])


def test_docs():
"""make sure the methods of SmartList/ListProxy have docstrings"""
methods = ["append", "count", "extend", "index", "insert", "pop",
"remove", "reverse", "sort"]
methods = [
"append",
"count",
"extend",
"index",
"insert",
"pop",
"remove",
"reverse",
"sort",
]
for meth in methods:
expected = getattr(list, meth).__doc__
smartlist_doc = getattr(SmartList, meth).__doc__
@@ -281,6 +296,7 @@ def test_docs():
assert expected == smartlist_doc
assert expected == listproxy_doc


def test_doctest():
"""make sure the test embedded in SmartList's docstring passes"""
parent = SmartList([0, 1, 2, 3])
@@ -291,38 +307,47 @@ def test_doctest():
assert [2, 3, 4] == child
assert [0, 1, 2, 3, 4] == parent


def test_parent_get_set_del():
"""make sure SmartList's getitem/setitem/delitem work"""
_test_get_set_del_item(SmartList)


def test_parent_add():
"""make sure SmartList's add/radd/iadd work"""
_test_add_radd_iadd(SmartList)


def test_parent_other_magics():
"""make sure SmartList's other magically implemented features work"""
_test_other_magic_methods(SmartList)


def test_parent_methods():
"""make sure SmartList's non-magic methods work, like append()"""
_test_list_methods(SmartList)


def test_child_get_set_del():
"""make sure ListProxy's getitem/setitem/delitem work"""
_dispatch_test_for_children(_test_get_set_del_item)


def test_child_add():
"""make sure ListProxy's add/radd/iadd work"""
_dispatch_test_for_children(_test_add_radd_iadd)


def test_child_other_magics():
"""make sure ListProxy's other magically implemented features work"""
_dispatch_test_for_children(_test_other_magic_methods)


def test_child_methods():
"""make sure ListProxy's non-magic methods work, like append()"""
_dispatch_test_for_children(_test_list_methods)


def test_influence():
"""make sure changes are propagated from parents to children"""
parent = SmartList([0, 1, 2, 3, 4, 5])


+ 95
- 20
tests/test_string_mixin.py

@@ -29,6 +29,7 @@ import pytest

from mwparserfromhell.string_mixin import StringMixIn


class _FakeString(StringMixIn):
def __init__(self, data):
self._data = data
@@ -36,22 +37,63 @@ class _FakeString(StringMixIn):
def __str__(self):
return self._data

@pytest.mark.parametrize('method', [
"capitalize", "casefold", "center", "count", "encode", "endswith",
"expandtabs", "find", "format", "format_map", "index", "isalnum",
"isalpha", "isdecimal", "isdigit", "isidentifier", "islower",
"isnumeric", "isprintable", "isspace", "istitle", "isupper",
"join", "ljust", "lower", "lstrip", "maketrans", "partition",
"replace", "rfind", "rindex", "rjust", "rpartition", "rsplit",
"rstrip", "split", "splitlines", "startswith", "strip", "swapcase",
"title", "translate", "upper", "zfill"
])

@pytest.mark.parametrize(
"method",
[
"capitalize",
"casefold",
"center",
"count",
"encode",
"endswith",
"expandtabs",
"find",
"format",
"format_map",
"index",
"isalnum",
"isalpha",
"isdecimal",
"isdigit",
"isidentifier",
"islower",
"isnumeric",
"isprintable",
"isspace",
"istitle",
"isupper",
"join",
"ljust",
"lower",
"lstrip",
"maketrans",
"partition",
"replace",
"rfind",
"rindex",
"rjust",
"rpartition",
"rsplit",
"rstrip",
"split",
"splitlines",
"startswith",
"strip",
"swapcase",
"title",
"translate",
"upper",
"zfill",
],
)
def test_docs(method):
"""make sure the various methods of StringMixIn have docstrings"""
expected = getattr("foo", method).__doc__
actual = getattr(_FakeString("foo"), method).__doc__
assert expected == actual


def test_types():
"""make sure StringMixIns convert to different types correctly"""
fstr = _FakeString("fake string")
@@ -63,6 +105,7 @@ def test_types():
assert isinstance(bytes(fstr), bytes)
assert isinstance(repr(fstr), str)


def test_comparisons():
"""make sure comparison operators work"""
str1 = _FakeString("this is a fake string")
@@ -99,6 +142,7 @@ def test_comparisons():
assert str5 < str1
assert str5 <= str1


def test_other_magics():
"""test other magically implemented features, like len() and iter()"""
str1 = _FakeString("fake string")
@@ -154,6 +198,7 @@ def test_other_magics():
assert "real" not in str1
assert "s" not in str2


def test_other_methods():
"""test the remaining non-magic methods of StringMixIn"""
str1 = _FakeString("fake string")
@@ -354,8 +399,21 @@ def test_other_methods():
actual = ["this", "is", "a", "sentence", "with", "whitespace"]
assert actual == str25.rsplit()
assert actual == str25.rsplit(None)
actual = ["", "", "", "this", "is", "a", "", "", "sentence", "with",
"", "whitespace", ""]
actual = [
"",
"",
"",
"this",
"is",
"a",
"",
"",
"sentence",
"with",
"",
"whitespace",
"",
]
assert actual == str25.rsplit(" ")
actual = [" this is a", "sentence", "with", "whitespace"]
assert actual == str25.rsplit(None, 3)
@@ -371,8 +429,21 @@ def test_other_methods():
actual = ["this", "is", "a", "sentence", "with", "whitespace"]
assert actual == str25.split()
assert actual == str25.split(None)
actual = ["", "", "", "this", "is", "a", "", "", "sentence", "with",
"", "whitespace", ""]
actual = [
"",
"",
"",
"this",
"is",
"a",
"",
"",
"sentence",
"with",
"",
"whitespace",
"",
]
assert actual == str25.split(" ")
actual = ["this", "is", "a", "sentence with whitespace "]
assert actual == str25.split(None, 3)
@@ -382,10 +453,15 @@ def test_other_methods():
assert actual == str25.split(maxsplit=3)

str26 = _FakeString("lines\nof\ntext\r\nare\r\npresented\nhere")
assert ["lines", "of", "text", "are", "presented", "here"] \
== str26.splitlines()
assert ["lines\n", "of\n", "text\r\n", "are\r\n", "presented\n", "here"] \
== str26.splitlines(True)
assert ["lines", "of", "text", "are", "presented", "here"] == str26.splitlines()
assert [
"lines\n",
"of\n",
"text\r\n",
"are\r\n",
"presented\n",
"here",
] == str26.splitlines(True)

assert str1.startswith("fake") is True
assert str1.startswith("faker") is False
@@ -398,8 +474,7 @@ def test_other_methods():

assert "Fake String" == str1.title()

table1 = StringMixIn.maketrans({97: "1", 101: "2", 105: "3",
111: "4", 117: "5"})
table1 = StringMixIn.maketrans({97: "1", 101: "2", 105: "3", 111: "4", 117: "5"})
table2 = StringMixIn.maketrans("aeiou", "12345")
table3 = StringMixIn.maketrans("aeiou", "12345", "rts")
assert "f1k2 str3ng" == str1.translate(table1)


+ 99
- 40
tests/test_tag.py

@@ -34,21 +34,20 @@ agennq = lambda name, value: Attribute(wraptext(name), wraptext(value), None)
agenp = lambda name, v, a, b, c: Attribute(wraptext(name), v, '"', a, b, c)
agenpnv = lambda name, a, b, c: Attribute(wraptext(name), None, '"', a, b, c)


def test_str():
"""test Tag.__str__()"""
node1 = Tag(wraptext("ref"))
node2 = Tag(wraptext("span"), wraptext("foo"),
[agen("style", "color: red;")])
node3 = Tag(wraptext("ref"),
attrs=[agennq("name", "foo"),
agenpnv("some_attr", " ", "", "")],
self_closing=True)
node2 = Tag(wraptext("span"), wraptext("foo"), [agen("style", "color: red;")])
node3 = Tag(
wraptext("ref"),
attrs=[agennq("name", "foo"), agenpnv("some_attr", " ", "", "")],
self_closing=True,
)
node4 = Tag(wraptext("br"), self_closing=True, padding=" ")
node5 = Tag(wraptext("br"), self_closing=True, implicit=True)
node6 = Tag(wraptext("br"), self_closing=True, invalid=True,
implicit=True)
node7 = Tag(wraptext("br"), self_closing=True, invalid=True,
padding=" ")
node6 = Tag(wraptext("br"), self_closing=True, invalid=True, implicit=True)
node7 = Tag(wraptext("br"), self_closing=True, invalid=True, padding=" ")
node8 = Tag(wraptext("hr"), wiki_markup="----", self_closing=True)
node9 = Tag(wraptext("i"), wraptext("italics!"), wiki_markup="''")

@@ -62,6 +61,7 @@ def test_str():
assert "----" == str(node8)
assert "''italics!''" == str(node9)


def test_children():
"""test Tag.__children__()"""
# <ref>foobar</ref>
@@ -69,10 +69,12 @@ def test_children():
# '''bold text'''
node2 = Tag(wraptext("b"), wraptext("bold text"), wiki_markup="'''")
# <img id="foo" class="bar" selected />
node3 = Tag(wraptext("img"),
attrs=[agen("id", "foo"), agen("class", "bar"),
agennv("selected")],
self_closing=True, padding=" ")
node3 = Tag(
wraptext("img"),
attrs=[agen("id", "foo"), agen("class", "bar"), agennv("selected")],
self_closing=True,
padding=" ",
)

gen1 = node1.__children__()
gen2 = node2.__children__()
@@ -94,6 +96,7 @@ def test_children():
with pytest.raises(StopIteration):
next(gen3)


def test_strip():
"""test Tag.__strip__()"""
node1 = Tag(wraptext("i"), wraptext("foobar"))
@@ -104,28 +107,46 @@ def test_strip():
assert node2.__strip__() is None
assert node3.__strip__() is None


def test_showtree():
"""test Tag.__showtree__()"""
output = []
getter, marker = object(), object()
get = lambda code: output.append((getter, code))
mark = lambda: output.append(marker)
node1 = Tag(wraptext("ref"), wraptext("text"),
[agen("name", "foo"), agennv("selected")])
node1 = Tag(
wraptext("ref"), wraptext("text"), [agen("name", "foo"), agennv("selected")]
)
node2 = Tag(wraptext("br"), self_closing=True, padding=" ")
node3 = Tag(wraptext("br"), self_closing=True, invalid=True,
implicit=True, padding=" ")
node3 = Tag(
wraptext("br"), self_closing=True, invalid=True, implicit=True, padding=" "
)
node1.__showtree__(output.append, get, mark)
node2.__showtree__(output.append, get, mark)
node3.__showtree__(output.append, get, mark)
valid = [
"<", (getter, node1.tag), (getter, node1.attributes[0].name),
" = ", marker, (getter, node1.attributes[0].value),
(getter, node1.attributes[1].name), ">", (getter, node1.contents),
"</", (getter, node1.closing_tag), ">", "<", (getter, node2.tag),
"/>", "</", (getter, node3.tag), ">"]
"<",
(getter, node1.tag),
(getter, node1.attributes[0].name),
" = ",
marker,
(getter, node1.attributes[0].value),
(getter, node1.attributes[1].name),
">",
(getter, node1.contents),
"</",
(getter, node1.closing_tag),
">",
"<",
(getter, node2.tag),
"/>",
"</",
(getter, node3.tag),
">",
]
assert valid == output


def test_tag():
"""test getter/setter for the tag attribute"""
tag = wraptext("ref")
@@ -137,6 +158,7 @@ def test_tag():
assert_wikicode_equal(wraptext("span"), node.closing_tag)
assert "<span>text</span>" == node


def test_contents():
"""test getter/setter for the contents attribute"""
contents = wraptext("text")
@@ -147,6 +169,7 @@ def test_contents():
assert_wikicode_equal(parsed, node.contents)
assert "<ref>text and a {{template}}</ref>" == node


def test_attributes():
"""test getter for the attributes attribute"""
attrs = [agen("name", "bar")]
@@ -155,6 +178,7 @@ def test_attributes():
assert [] == node1.attributes
assert attrs is node2.attributes


def test_wiki_markup():
"""test getter/setter for the wiki_markup attribute"""
node = Tag(wraptext("i"), wraptext("italic text"))
@@ -166,6 +190,7 @@ def test_wiki_markup():
assert node.wiki_markup is None
assert "<i>italic text</i>" == node


def test_self_closing():
"""test getter/setter for the self_closing attribute"""
node = Tag(wraptext("ref"), wraptext("foobar"))
@@ -177,6 +202,7 @@ def test_self_closing():
assert node.self_closing is False
assert "<ref>foobar</ref>" == node


def test_invalid():
"""test getter/setter for the invalid attribute"""
node = Tag(wraptext("br"), self_closing=True, implicit=True)
@@ -188,6 +214,7 @@ def test_invalid():
assert node.invalid is False
assert "<br>" == node


def test_implicit():
"""test getter/setter for the implicit attribute"""
node = Tag(wraptext("br"), self_closing=True)
@@ -199,6 +226,7 @@ def test_implicit():
assert node.implicit is False
assert "<br/>" == node


def test_padding():
"""test getter/setter for the padding attribute"""
node = Tag(wraptext("ref"), wraptext("foobar"))
@@ -212,6 +240,7 @@ def test_padding():
with pytest.raises(ValueError):
node.__setattr__("padding", True)


def test_closing_tag():
"""test getter/setter for the closing_tag attribute"""
tag = wraptext("ref")
@@ -222,6 +251,7 @@ def test_closing_tag():
assert_wikicode_equal(parsed, node.closing_tag)
assert "<ref>foobar</ref {{ignore me}}>" == node


def test_wiki_style_separator():
"""test getter/setter for wiki_style_separator attribute"""
node = Tag(wraptext("table"), wraptext("\n"))
@@ -233,6 +263,7 @@ def test_wiki_style_separator():
node2 = Tag(wraptext("table"), wraptext("\n"), wiki_style_separator="|")
assert "|" == node2.wiki_style_separator


def test_closing_wiki_markup():
"""test getter/setter for closing_wiki_markup attribute"""
node = Tag(wraptext("table"), wraptext("\n"))
@@ -248,12 +279,17 @@ def test_closing_wiki_markup():
node.wiki_markup = False
assert node.closing_wiki_markup is None
assert "<table>\n</table>" == node
node2 = Tag(wraptext("table"), wraptext("\n"),
attrs=[agen("id", "foo")], wiki_markup="{|",
closing_wiki_markup="|}")
node2 = Tag(
wraptext("table"),
wraptext("\n"),
attrs=[agen("id", "foo")],
wiki_markup="{|",
closing_wiki_markup="|}",
)
assert "|}" == node2.closing_wiki_markup
assert '{| id="foo"\n|}' == node2


def test_has():
"""test Tag.has()"""
node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")])
@@ -263,19 +299,26 @@ def test_has():
assert node.has("Name") is False
assert node.has("foo") is False

attrs = [agen("id", "foo"), agenp("class", "bar", " ", "\n", "\n"),
agen("foo", "bar"), agenpnv("foo", " ", " \n ", " \t")]
attrs = [
agen("id", "foo"),
agenp("class", "bar", " ", "\n", "\n"),
agen("foo", "bar"),
agenpnv("foo", " ", " \n ", " \t"),
]
node2 = Tag(wraptext("div"), attrs=attrs, self_closing=True)
assert node2.has("id") is True
assert node2.has("class") is True
assert node2.has(attrs[1].pad_first + str(attrs[1].name) +
attrs[1].pad_before_eq) is True
assert (
node2.has(attrs[1].pad_first + str(attrs[1].name) + attrs[1].pad_before_eq)
is True
)
assert node2.has(attrs[3]) is True
assert node2.has(str(attrs[3])) is True
assert node2.has("idclass") is False
assert node2.has("id class") is False
assert node2.has("id=foo") is False


def test_get():
"""test Tag.get()"""
attrs = [agen("name", "foo")]
@@ -288,13 +331,18 @@ def test_get():
with pytest.raises(ValueError):
node.get("foo")

attrs = [agen("id", "foo"), agenp("class", "bar", " ", "\n", "\n"),
agen("foo", "bar"), agenpnv("foo", " ", " \n ", " \t")]
attrs = [
agen("id", "foo"),
agenp("class", "bar", " ", "\n", "\n"),
agen("foo", "bar"),
agenpnv("foo", " ", " \n ", " \t"),
]
node2 = Tag(wraptext("div"), attrs=attrs, self_closing=True)
assert attrs[0] is node2.get("id")
assert attrs[1] is node2.get("class")
assert attrs[1] is node2.get(
attrs[1].pad_first + str(attrs[1].name) + attrs[1].pad_before_eq)
attrs[1].pad_first + str(attrs[1].name) + attrs[1].pad_before_eq
)
assert attrs[3] is node2.get(attrs[3])
assert attrs[3] is node2.get(str(attrs[3]))
assert attrs[3] is node2.get(" foo")
@@ -305,6 +353,7 @@ def test_get():
with pytest.raises(ValueError):
node2.get("id=foo")


def test_add():
"""test Tag.add()"""
node = Tag(wraptext("ref"), wraptext("cite"))
@@ -330,19 +379,29 @@ def test_add():
assert attr6 == node.attributes[5]
assert attr7 == node.attributes[6]
assert attr7 == node.get("name")
assert_wikicode_equal(wrap([Template(wraptext("foobar"))]),
node.attributes[5].value)
assert "".join(("<ref", attr1, attr2, attr3, attr4, attr5,
attr6, attr7, ">cite</ref>")) == node
assert_wikicode_equal(
wrap([Template(wraptext("foobar"))]), node.attributes[5].value
)
assert (
"".join(
("<ref", attr1, attr2, attr3, attr4, attr5, attr6, attr7, ">cite</ref>")
)
== node
)
with pytest.raises(ValueError):
node.add("name", "foo", quotes="bar")
with pytest.raises(ValueError):
node.add("name", "a bc d", quotes=None)


def test_remove():
"""test Tag.remove()"""
attrs = [agen("id", "foo"), agenp("class", "bar", " ", "\n", "\n"),
agen("foo", "bar"), agenpnv("foo", " ", " \n ", " \t")]
attrs = [
agen("id", "foo"),
agenp("class", "bar", " ", "\n", "\n"),
agen("foo", "bar"),
agenpnv("foo", " ", " \n ", " \t"),
]
node = Tag(wraptext("div"), attrs=attrs, self_closing=True)
node.remove("class")
assert '<div id="foo" foo="bar" foo \n />' == node
@@ -351,4 +410,4 @@ def test_remove():
with pytest.raises(ValueError):
node.remove("foo")
node.remove("id")
assert '<div/>' == node
assert "<div/>" == node

+ 267
- 140
tests/test_template.py

@@ -34,19 +34,19 @@ from .conftest import assert_wikicode_equal, wrap, wraptext
pgens = lambda k, v: Parameter(wraptext(k), wraptext(v), showkey=True)
pgenh = lambda k, v: Parameter(wraptext(k), wraptext(v), showkey=False)


def test_str():
"""test Template.__str__()"""
node = Template(wraptext("foobar"))
assert "{{foobar}}" == str(node)
node2 = Template(wraptext("foo"),
[pgenh("1", "bar"), pgens("abc", "def")])
node2 = Template(wraptext("foo"), [pgenh("1", "bar"), pgens("abc", "def")])
assert "{{foo|bar|abc=def}}" == str(node2)


def test_children():
"""test Template.__children__()"""
node2p1 = Parameter(wraptext("1"), wraptext("bar"), showkey=False)
node2p2 = Parameter(wraptext("abc"), wrap([Text("def"), Text("ghi")]),
showkey=True)
node2p2 = Parameter(wraptext("abc"), wrap([Text("def"), Text("ghi")]), showkey=True)
node1 = Template(wraptext("foobar"))
node2 = Template(wraptext("foo"), [node2p1, node2p2])

@@ -62,16 +62,23 @@ def test_children():
with pytest.raises(StopIteration):
next(gen2)


def test_strip():
"""test Template.__strip__()"""
node1 = Template(wraptext("foobar"))
node2 = Template(wraptext("foo"), [
pgenh("1", "bar"), pgens("foo", ""), pgens("abc", "def")])
node3 = Template(wraptext("foo"), [
pgenh("1", "foo"),
Parameter(wraptext("2"), wrap([Template(wraptext("hello"))]),
showkey=False),
pgenh("3", "bar")])
node2 = Template(
wraptext("foo"), [pgenh("1", "bar"), pgens("foo", ""), pgens("abc", "def")]
)
node3 = Template(
wraptext("foo"),
[
pgenh("1", "foo"),
Parameter(
wraptext("2"), wrap([Template(wraptext("hello"))]), showkey=False
),
pgenh("3", "bar"),
],
)

assert node1.__strip__(keep_template_params=False) is None
assert node2.__strip__(keep_template_params=False) is None
@@ -79,6 +86,7 @@ def test_strip():
assert "bar def" == node2.__strip__(keep_template_params=True)
assert "foo bar" == node3.__strip__(keep_template_params=True)


def test_showtree():
"""test Template.__showtree__()"""
output = []
@@ -86,18 +94,32 @@ def test_showtree():
get = lambda code: output.append((getter, code))
mark = lambda: output.append(marker)
node1 = Template(wraptext("foobar"))
node2 = Template(wraptext("foo"),
[pgenh("1", "bar"), pgens("abc", "def")])
node2 = Template(wraptext("foo"), [pgenh("1", "bar"), pgens("abc", "def")])
node1.__showtree__(output.append, get, mark)
node2.__showtree__(output.append, get, mark)
valid = [
"{{", (getter, node1.name), "}}", "{{", (getter, node2.name),
" | ", marker, (getter, node2.params[0].name), " = ", marker,
(getter, node2.params[0].value), " | ", marker,
(getter, node2.params[1].name), " = ", marker,
(getter, node2.params[1].value), "}}"]
"{{",
(getter, node1.name),
"}}",
"{{",
(getter, node2.name),
" | ",
marker,
(getter, node2.params[0].name),
" = ",
marker,
(getter, node2.params[0].value),
" | ",
marker,
(getter, node2.params[1].name),
" = ",
marker,
(getter, node2.params[1].value),
"}}",
]
assert valid == output


def test_name():
"""test getter/setter for the name attribute"""
name = wraptext("foobar")
@@ -110,6 +132,7 @@ def test_name():
assert_wikicode_equal(wraptext("asdf"), node1.name)
assert_wikicode_equal(wraptext("téstïng"), node2.name)


def test_params():
"""test getter for the params attribute"""
node1 = Template(wraptext("foobar"))
@@ -118,13 +141,14 @@ def test_params():
assert [] == node1.params
assert plist is node2.params


def test_has():
"""test Template.has()"""
node1 = Template(wraptext("foobar"))
node2 = Template(wraptext("foo"),
[pgenh("1", "bar"), pgens("\nabc ", "def")])
node3 = Template(wraptext("foo"),
[pgenh("1", "a"), pgens("b", "c"), pgens("1", "d")])
node2 = Template(wraptext("foo"), [pgenh("1", "bar"), pgens("\nabc ", "def")])
node3 = Template(
wraptext("foo"), [pgenh("1", "a"), pgens("b", "c"), pgens("1", "d")]
)
node4 = Template(wraptext("foo"), [pgenh("1", "a"), pgens("b", " ")])
assert node1.has("foobar", False) is False
assert node2.has(1, False) is True
@@ -138,6 +162,7 @@ def test_has():
assert node1.has_param("foobar", False) is False
assert node2.has_param(1, False) is True


def test_get():
"""test Template.get()"""
node1 = Template(wraptext("foobar"))
@@ -159,16 +184,15 @@ def test_get():
assert node3p2 is node3.get("1")
assert node4p1 is node4.get("b ")


def test_add():
"""test Template.add()"""
node1 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "d")])
node2 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "d")])
node3 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "d")])
node4 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "d")])
node5 = Template(wraptext("a"), [pgens("b", "c"),
pgens(" d ", "e")])
node6 = Template(wraptext("a"), [pgens("b", "c"), pgens("b", "d"),
pgens("b", "e")])
node5 = Template(wraptext("a"), [pgens("b", "c"), pgens(" d ", "e")])
node6 = Template(wraptext("a"), [pgens("b", "c"), pgens("b", "d"), pgens("b", "e")])
node7 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "d")])
node8p = pgenh("1", "d")
node8 = Template(wraptext("a"), [pgens("b", "c"), node8p])
@@ -176,48 +200,87 @@ def test_add():
node10 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "e")])
node11 = Template(wraptext("a"), [pgens("b", "c")])
node12 = Template(wraptext("a"), [pgens("b", "c")])
node13 = Template(wraptext("a"), [
pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")])
node14 = Template(wraptext("a\n"), [
pgens("b ", "c\n"), pgens("d ", " e"), pgens("f ", "g\n"),
pgens("h ", " i\n")])
node15 = Template(wraptext("a"), [
pgens("b ", " c\n"), pgens("\nd ", " e"), pgens("\nf ", "g ")])
node16 = Template(wraptext("a"), [
pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")])
node13 = Template(
wraptext("a"), [pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")]
)
node14 = Template(
wraptext("a\n"),
[
pgens("b ", "c\n"),
pgens("d ", " e"),
pgens("f ", "g\n"),
pgens("h ", " i\n"),
],
)
node15 = Template(
wraptext("a"),
[pgens("b ", " c\n"), pgens("\nd ", " e"), pgens("\nf ", "g ")],
)
node16 = Template(
wraptext("a"), [pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")]
)
node17 = Template(wraptext("a"), [pgenh("1", "b")])
node18 = Template(wraptext("a"), [pgenh("1", "b")])
node19 = Template(wraptext("a"), [pgenh("1", "b")])
node20 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"),
pgenh("3", "d"), pgenh("4", "e")])
node21 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"),
pgens("4", "d"), pgens("5", "e")])
node22 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"),
pgens("4", "d"), pgens("5", "e")])
node20 = Template(
wraptext("a"),
[pgenh("1", "b"), pgenh("2", "c"), pgenh("3", "d"), pgenh("4", "e")],
)
node21 = Template(
wraptext("a"),
[pgenh("1", "b"), pgenh("2", "c"), pgens("4", "d"), pgens("5", "e")],
)
node22 = Template(
wraptext("a"),
[pgenh("1", "b"), pgenh("2", "c"), pgens("4", "d"), pgens("5", "e")],
)
node23 = Template(wraptext("a"), [pgenh("1", "b")])
node24 = Template(wraptext("a"), [pgenh("1", "b")])
node25 = Template(wraptext("a"), [pgens("b", "c")])
node26 = Template(wraptext("a"), [pgenh("1", "b")])
node27 = Template(wraptext("a"), [pgenh("1", "b")])
node28 = Template(wraptext("a"), [pgens("1", "b")])
node29 = Template(wraptext("a"), [
pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")])
node30 = Template(wraptext("a\n"), [
pgens("b ", "c\n"), pgens("d ", " e"), pgens("f ", "g\n"),
pgens("h ", " i\n")])
node31 = Template(wraptext("a"), [
pgens("b ", " c\n"), pgens("\nd ", " e"), pgens("\nf ", "g ")])
node32 = Template(wraptext("a"), [
pgens("\nb ", " c "), pgens("\nd ", " e "), pgens("\nf ", " g ")])
node33 = Template(wraptext("a"), [pgens("b", "c"), pgens("d", "e"),
pgens("b", "f"), pgens("b", "h"),
pgens("i", "j")])
node34 = Template(wraptext("a"), [pgens("1", "b"), pgens("x", "y"),
pgens("1", "c"), pgens("2", "d")])
node35 = Template(wraptext("a"), [pgens("1", "b"), pgens("x", "y"),
pgenh("1", "c"), pgenh("2", "d")])
node36 = Template(wraptext("a"), [pgens("b", "c"), pgens("d", "e"),
pgens("f", "g")])
node29 = Template(
wraptext("a"), [pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")]
)
node30 = Template(
wraptext("a\n"),
[
pgens("b ", "c\n"),
pgens("d ", " e"),
pgens("f ", "g\n"),
pgens("h ", " i\n"),
],
)
node31 = Template(
wraptext("a"),
[pgens("b ", " c\n"), pgens("\nd ", " e"), pgens("\nf ", "g ")],
)
node32 = Template(
wraptext("a"),
[pgens("\nb ", " c "), pgens("\nd ", " e "), pgens("\nf ", " g ")],
)
node33 = Template(
wraptext("a"),
[
pgens("b", "c"),
pgens("d", "e"),
pgens("b", "f"),
pgens("b", "h"),
pgens("i", "j"),
],
)
node34 = Template(
wraptext("a"),
[pgens("1", "b"), pgens("x", "y"), pgens("1", "c"), pgens("2", "d")],
)
node35 = Template(
wraptext("a"),
[pgens("1", "b"), pgens("x", "y"), pgenh("1", "c"), pgenh("2", "d")],
)
node36 = Template(
wraptext("a"), [pgens("b", "c"), pgens("d", "e"), pgens("f", "g")]
)
node37 = Template(wraptext("a"), [pgenh("1", "")])
node38 = Template(wraptext("abc"))
node39 = Template(wraptext("a"), [pgenh("1", " b ")])
@@ -320,65 +383,121 @@ def test_add():
assert "{{a|1= b|2= c|3= d}}" == node41
assert "{{a|b=hello \n}}" == node42


def test_remove():
"""test Template.remove()"""
node1 = Template(wraptext("foobar"))
node2 = Template(wraptext("foo"),
[pgenh("1", "bar"), pgens("abc", "def")])
node3 = Template(wraptext("foo"),
[pgenh("1", "bar"), pgens("abc", "def")])
node4 = Template(wraptext("foo"),
[pgenh("1", "bar"), pgenh("2", "baz")])
node5 = Template(wraptext("foo"), [
pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
node6 = Template(wraptext("foo"), [
pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
node7 = Template(wraptext("foo"), [
pgens("1 ", "a"), pgens(" 1", "b"), pgens("2", "c")])
node8 = Template(wraptext("foo"), [
pgens("1 ", "a"), pgens(" 1", "b"), pgens("2", "c")])
node9 = Template(wraptext("foo"), [
pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
node10 = Template(wraptext("foo"), [
pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
node11 = Template(wraptext("foo"), [
pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
node12 = Template(wraptext("foo"), [
pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
node13 = Template(wraptext("foo"), [
pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
node14 = Template(wraptext("foo"), [
pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
node15 = Template(wraptext("foo"), [
pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
node16 = Template(wraptext("foo"), [
pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
node17 = Template(wraptext("foo"), [
pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
node18 = Template(wraptext("foo"), [
pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
node19 = Template(wraptext("foo"), [
pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
node20 = Template(wraptext("foo"), [
pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
node21 = Template(wraptext("foo"), [
pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
pgens("a", "b")])
node22 = Template(wraptext("foo"), [
pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
pgens("a", "b")])
node23 = Template(wraptext("foo"), [
pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
pgens("a", "b")])
node24 = Template(wraptext("foo"), [
pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
pgens("a", "b")])
node25 = Template(wraptext("foo"), [
pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
pgens("a", "b")])
node26 = Template(wraptext("foo"), [
pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
pgens("a", "b")])
node2 = Template(wraptext("foo"), [pgenh("1", "bar"), pgens("abc", "def")])
node3 = Template(wraptext("foo"), [pgenh("1", "bar"), pgens("abc", "def")])
node4 = Template(wraptext("foo"), [pgenh("1", "bar"), pgenh("2", "baz")])
node5 = Template(
wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
)
node6 = Template(
wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
)
node7 = Template(
wraptext("foo"), [pgens("1 ", "a"), pgens(" 1", "b"), pgens("2", "c")]
)
node8 = Template(
wraptext("foo"), [pgens("1 ", "a"), pgens(" 1", "b"), pgens("2", "c")]
)
node9 = Template(
wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
)
node10 = Template(
wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
)
node11 = Template(
wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
)
node12 = Template(
wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
)
node13 = Template(
wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
)
node14 = Template(
wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
)
node15 = Template(
wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
)
node16 = Template(
wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
)
node17 = Template(
wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
)
node18 = Template(
wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
)
node19 = Template(
wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
)
node20 = Template(
wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
)
node21 = Template(
wraptext("foo"),
[
pgens("a", "b"),
pgens("c", "d"),
pgens("e", "f"),
pgens("a", "b"),
pgens("a", "b"),
],
)
node22 = Template(
wraptext("foo"),
[
pgens("a", "b"),
pgens("c", "d"),
pgens("e", "f"),
pgens("a", "b"),
pgens("a", "b"),
],
)
node23 = Template(
wraptext("foo"),
[
pgens("a", "b"),
pgens("c", "d"),
pgens("e", "f"),
pgens("a", "b"),
pgens("a", "b"),
],
)
node24 = Template(
wraptext("foo"),
[
pgens("a", "b"),
pgens("c", "d"),
pgens("e", "f"),
pgens("a", "b"),
pgens("a", "b"),
],
)
node25 = Template(
wraptext("foo"),
[
pgens("a", "b"),
pgens("c", "d"),
pgens("e", "f"),
pgens("a", "b"),
pgens("a", "b"),
],
)
node26 = Template(
wraptext("foo"),
[
pgens("a", "b"),
pgens("c", "d"),
pgens("e", "f"),
pgens("a", "b"),
pgens("a", "b"),
],
)
node27 = Template(wraptext("foo"), [pgenh("1", "bar")])
node28 = Template(wraptext("foo"), [pgenh("1", "bar")])

@@ -444,12 +563,14 @@ def test_remove():
with pytest.raises(ValueError):
node27.remove(node28.get(1))
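
The remove() surface being exercised, in brief:

import mwparserfromhell
import pytest

tmpl = mwparserfromhell.parse("{{foo|1=bar|abc=def}}").filter_templates()[0]
tmpl.remove("abc")          # remove by name...
tmpl.remove(tmpl.get(1))    # ...or by Parameter object
assert str(tmpl) == "{{foo}}"
with pytest.raises(ValueError):
    tmpl.remove("abc")      # absent parameters raise ValueError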


def test_formatting():
"""test realistic param manipulation with complex whitespace formatting
(assumes that parsing works correctly)"""
tests = [
# https://en.wikipedia.org/w/index.php?title=Lamar_County,_Georgia&oldid=792356004
("""{{Infobox U.S. county
# https://en.wikipedia.org/w/index.php?title=Lamar_County,_Georgia&oldid=792356004
(
"""{{Infobox U.S. county
| county = Lamar County
| state = Georgia
| seal =
@@ -471,16 +592,17 @@ def test_formatting():
| district = 3rd
| named for = [[Lucius Quintus Cincinnatus Lamar II]]
}}""",
"""@@ -11,4 +11,4 @@
"""@@ -11,4 +11,4 @@
| area percentage = 1.3%
-| census yr = 2010
-| pop = 18317
+| census estimate yr = 2016
+| pop = 12345<ref>example ref</ref>
| density_sq_mi = 100"""),

# https://en.wikipedia.org/w/index.php?title=Rockdale_County,_Georgia&oldid=792359760
("""{{Infobox U.S. County|
| density_sq_mi = 100""",
),
# https://en.wikipedia.org/w/index.php?title=Rockdale_County,_Georgia&oldid=792359760
(
"""{{Infobox U.S. County|
county = Rockdale County |
state = Georgia |
seal = |
@@ -500,16 +622,17 @@ def test_formatting():
| district = 4th
| time zone= Eastern
}}""",
"""@@ -11,4 +11,4 @@
"""@@ -11,4 +11,4 @@
area percentage = 1.7% |
- census yr = 2010|
- pop = 85215 |
+ census estimate yr = 2016 |
+ pop = 12345<ref>example ref</ref> |
density_sq_mi = 657 |"""),

# https://en.wikipedia.org/w/index.php?title=Spalding_County,_Georgia&oldid=792360413
("""{{Infobox U.S. County|
density_sq_mi = 657 |""",
),
# https://en.wikipedia.org/w/index.php?title=Spalding_County,_Georgia&oldid=792360413
(
"""{{Infobox U.S. County|
| county = Spalding County |
| state = Georgia |
| seal = |
@@ -530,16 +653,17 @@ def test_formatting():
| district = 3rd
| time zone = Eastern
}}""",
"""@@ -11,4 +11,4 @@
"""@@ -11,4 +11,4 @@
| area percentage = 1.6% |
-| census yr = 2010|
-| pop = 64073 |
+|
+| census estimate yr = 2016 | pop = 12345<ref>example ref</ref> |
| density_sq_mi = 326 |"""),

# https://en.wikipedia.org/w/index.php?title=Clinton_County,_Illinois&oldid=794694648
("""{{Infobox U.S. county
| density_sq_mi = 326 |""",
),
# https://en.wikipedia.org/w/index.php?title=Clinton_County,_Illinois&oldid=794694648
(
"""{{Infobox U.S. county
|county = Clinton County
|state = Illinois
| ex image = File:Clinton County Courthouse, Carlyle.jpg
@@ -560,16 +684,17 @@ def test_formatting():
|web = www.clintonco.illinois.gov
| district = 15th
}}""",
"""@@ -15,4 +15,4 @@
"""@@ -15,4 +15,4 @@
|area percentage = 5.8%
- |census yr = 2010
- |pop = 37762
+ |census estimate yr = 2016
+ |pop = 12345<ref>example ref</ref>
|density_sq_mi = 80"""),

# https://en.wikipedia.org/w/index.php?title=Winnebago_County,_Illinois&oldid=789193800
("""{{Infobox U.S. county |
|density_sq_mi = 80""",
),
# https://en.wikipedia.org/w/index.php?title=Winnebago_County,_Illinois&oldid=789193800
(
"""{{Infobox U.S. county |
county = Winnebago County |
state = Illinois |
seal = Winnebago County il seal.png |
@@ -590,19 +715,21 @@ def test_formatting():
| district = 16th
| district2 = 17th
}}""",
"""@@ -11,4 +11,4 @@
"""@@ -11,4 +11,4 @@
area percentage = 1.1% |
- census yr = 2010|
- pop = 295266 |
+ census estimate yr = 2016|
+ pop = 12345<ref>example ref</ref> |
density_sq_mi = 575""")]
density_sq_mi = 575""",
),
]

for (original, expected) in tests:
code = parse(original)
template = code.filter_templates()[0]
template.add("pop", "12345<ref>example ref</ref>")
template.add('census estimate yr', "2016", before="pop")
template.add("census estimate yr", "2016", before="pop")
template.remove("census yr")

oldlines = original.splitlines(True)
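
Outside the fixtures, the loop above amounts to this workflow (the infobox text here is invented for illustration):

import mwparserfromhell

text = "{{Infobox | census yr = 2010 | pop = 18317 }}"
template = mwparserfromhell.parse(text).filter_templates()[0]
template.add("pop", "12345<ref>example ref</ref>")
template.add("census estimate yr", "2016", before="pop")
template.remove("census yr")
# add() and remove() preserve the "| key = value" spacing style already in use.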


+5 -0 tests/test_text.py

@@ -26,6 +26,7 @@ import pytest

from mwparserfromhell.nodes import Text


def test_str():
"""test Text.__str__()"""
node = Text("foobar")
@@ -33,6 +34,7 @@ def test_str():
node2 = Text("fóóbar")
assert "fóóbar" == str(node2)


def test_children():
"""test Text.__children__()"""
node = Text("foobar")
@@ -40,11 +42,13 @@ def test_children():
with pytest.raises(StopIteration):
next(gen)


def test_strip():
"""test Text.__strip__()"""
node = Text("foobar")
assert node is node.__strip__()


def test_showtree():
"""test Text.__showtree__()"""
output = []
@@ -57,6 +61,7 @@ def test_showtree():
res = ["foobar", r"f\xf3\xf3bar", "\\U00010332\\U0001033f\\U00010344"]
assert res == output


def test_value():
"""test getter/setter for the value attribute"""
node = Text("foobar")


+26 -14 tests/test_tokenizer.py

@@ -33,29 +33,32 @@ try:
except ImportError:
CTokenizer = None


class _TestParseError(Exception):
"""Raised internally when a test could not be parsed."""


def _parse_test(test, data):
"""Parse an individual *test*, storing its info in *data*."""
for line in test.strip().splitlines():
if line.startswith("name:"):
data["name"] = line[len("name:"):].strip()
data["name"] = line[len("name:") :].strip()
elif line.startswith("label:"):
data["label"] = line[len("label:"):].strip()
data["label"] = line[len("label:") :].strip()
elif line.startswith("input:"):
raw = line[len("input:"):].strip()
raw = line[len("input:") :].strip()
if raw[0] == '"' and raw[-1] == '"':
raw = raw[1:-1]
raw = raw.encode("raw_unicode_escape")
data["input"] = raw.decode("unicode_escape")
elif line.startswith("output:"):
raw = line[len("output:"):].strip()
raw = line[len("output:") :].strip()
try:
data["output"] = eval(raw, vars(tokens))
except Exception as err:
raise _TestParseError(err) from err


def _load_tests(filename, name, text):
"""Load all tests in *text* from the file *filename*."""
tests = text.split("\n---\n")
@@ -77,15 +80,18 @@ def _load_tests(filename, name, text):
warnings.warn(error.format(filename))
continue
if data["input"] is None or data["output"] is None:
error = "Test '{}' in '{}' was ignored because it lacked an input or an output"
error = (
"Test '{}' in '{}' was ignored because it lacked an input or an output"
)
warnings.warn(error.format(data["name"], filename))
continue

# Include test filename in name
data['name'] = '{}:{}'.format(name, data['name'])
data["name"] = "{}:{}".format(name, data["name"])

yield data


def build():
"""Load and install all tests from the 'tokenizer' directory."""
directory = path.join(path.dirname(__file__), "tokenizer")
@@ -96,31 +102,37 @@ def build():
fullname = path.join(directory, filename)
with codecs.open(fullname, "r", encoding="utf8") as fp:
text = fp.read()
name = path.split(fullname)[1][:-len(extension)]
name = path.split(fullname)[1][: -len(extension)]
yield from _load_tests(fullname, name, text)

@pytest.mark.parametrize("tokenizer", filter(None, (
CTokenizer, PyTokenizer
)), ids=lambda t: 'CTokenizer' if t.USES_C else 'PyTokenizer')
@pytest.mark.parametrize("data", build(), ids=lambda data: data['name'])

@pytest.mark.parametrize(
"tokenizer",
filter(None, (CTokenizer, PyTokenizer)),
ids=lambda t: "CTokenizer" if t.USES_C else "PyTokenizer",
)
@pytest.mark.parametrize("data", build(), ids=lambda data: data["name"])
def test_tokenizer(tokenizer, data):
expected = data["output"]
actual = tokenizer().tokenize(data["input"])
assert expected == actual

@pytest.mark.parametrize("data", build(), ids=lambda data: data['name'])

@pytest.mark.parametrize("data", build(), ids=lambda data: data["name"])
def test_roundtrip(data):
expected = data["input"]
actual = str(Builder().build(data["output"][:]))
assert expected == actual

@pytest.mark.skipif(CTokenizer is None, reason='CTokenizer not available')

@pytest.mark.skipif(CTokenizer is None, reason="CTokenizer not available")
def test_c_tokenizer_uses_c():
"""make sure the C tokenizer identifies as using a C extension"""
assert CTokenizer.USES_C is True
assert CTokenizer().USES_C is True


def test_describe_context():
assert "" == contexts.describe(0)
ctx = contexts.describe(contexts.TEMPLATE_PARAM_KEY|contexts.HAS_TEXT)
ctx = contexts.describe(contexts.TEMPLATE_PARAM_KEY | contexts.HAS_TEXT)
assert "TEMPLATE_PARAM_KEY|HAS_TEXT" == ctx

+9 -5 tests/test_tokens.py

@@ -26,6 +26,7 @@ import pytest

from mwparserfromhell.parser import tokens


@pytest.mark.parametrize("name", tokens.__all__)
def test_issubclass(name):
"""check that all classes within the tokens module are really Tokens"""
@@ -34,6 +35,7 @@ def test_issubclass(name):
assert isinstance(klass(), klass)
assert isinstance(klass(), tokens.Token)


def test_attributes():
"""check that Token attributes can be managed properly"""
token1 = tokens.Token()
@@ -54,6 +56,7 @@ def test_attributes():
with pytest.raises(KeyError):
token2.__delattr__("baz")


def test_repr():
"""check that repr() on a Token works as expected"""
token1 = tokens.Token()
@@ -65,6 +68,7 @@ def test_repr():
assert repr(token2) in ("Token(foo='bar', baz=123)", "Token(baz=123, foo='bar')")
assert "Text(text='" + hundredchars + "')" == repr(token3)


def test_equality():
"""check that equivalent tokens are considered equal"""
token1 = tokens.Token()
@@ -83,11 +87,11 @@ def test_equality():
assert token4 != token6
assert token5 != token6

@pytest.mark.parametrize("token", [
tokens.Token(),
tokens.Token(foo="bar", baz=123),
tokens.Text(text="earwig")
])
@pytest.mark.parametrize(
"token",
[tokens.Token(), tokens.Token(foo="bar", baz=123), tokens.Text(text="earwig")],
)
def test_repr_equality(token):
"""check that eval(repr(token)) == token"""
assert token == eval(repr(token), vars(tokens))
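
Summarizing the Token behavior covered above in one runnable snippet:

from mwparserfromhell.parser import tokens

tok = tokens.Token(foo="bar")
tok.baz = 123                 # attributes can be set freely...
del tok.foo                   # ...and deleted again
assert repr(tok) == "Token(baz=123)"
assert tok == tokens.Token(baz=123)   # equality compares type and attributes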

+23 -18 tests/test_utils.py

@@ -28,28 +28,33 @@ from mwparserfromhell.nodes import Template, Text
from mwparserfromhell.utils import parse_anything
from .conftest import assert_wikicode_equal, wrap, wraptext

@pytest.mark.parametrize("test,valid", [
(wraptext("foobar"), wraptext("foobar")),
(Template(wraptext("spam")), wrap([Template(wraptext("spam"))])),
("fóóbar", wraptext("fóóbar")),
(b"foob\xc3\xa1r", wraptext("foobár")),
(123, wraptext("123")),
(True, wraptext("True")),
(None, wrap([])),
([Text("foo"), Text("bar"), Text("baz")],
wraptext("foo", "bar", "baz")),
([wraptext("foo"), Text("bar"), "baz", 123, 456],
wraptext("foo", "bar", "baz", "123", "456")),
([[[([[((("foo",),),)], "bar"],)]]], wraptext("foo", "bar"))
])

@pytest.mark.parametrize(
"test,valid",
[
(wraptext("foobar"), wraptext("foobar")),
(Template(wraptext("spam")), wrap([Template(wraptext("spam"))])),
("fóóbar", wraptext("fóóbar")),
(b"foob\xc3\xa1r", wraptext("foobár")),
(123, wraptext("123")),
(True, wraptext("True")),
(None, wrap([])),
([Text("foo"), Text("bar"), Text("baz")], wraptext("foo", "bar", "baz")),
(
[wraptext("foo"), Text("bar"), "baz", 123, 456],
wraptext("foo", "bar", "baz", "123", "456"),
),
([[[([[((("foo",),),)], "bar"],)]]], wraptext("foo", "bar")),
],
)
def test_parse_anything_valid(test, valid):
"""tests for valid input to utils.parse_anything()"""
assert_wikicode_equal(valid, parse_anything(test))
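
The coercion rules under test, at a glance:

from mwparserfromhell.utils import parse_anything

# Strings, bytes, numbers, nodes, Wikicode, None, and arbitrarily nested
# iterables of these all flatten into a single Wikicode object.
assert str(parse_anything(["foo", 123, None])) == "foo123"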

@pytest.mark.parametrize("invalid", [
Ellipsis, object, object(), type,
["foo", [object]]
])
@pytest.mark.parametrize(
"invalid", [Ellipsis, object, object(), type, ["foo", [object]]]
)
def test_parse_anything_invalid(invalid):
"""tests for invalid input to utils.parse_anything()"""
with pytest.raises(ValueError):


+140 -67 tests/test_wikicode.py

@@ -34,6 +34,7 @@ from mwparserfromhell.wikicode import Wikicode
from mwparserfromhell import parse
from .conftest import wrap, wraptext


def test_str():
"""test Wikicode.__str__()"""
code1 = parse("foobar")
@@ -41,6 +42,7 @@ def test_str():
assert "foobar" == str(code1)
assert "Have a {{template}} and a [[page|link]]" == str(code2)


def test_nodes():
"""test getter/setter for the nodes attribute"""
code = parse("Have a {{template}}")
@@ -57,6 +59,7 @@ def test_nodes():
with pytest.raises(ValueError):
code.__setattr__("nodes", object)


def test_get():
"""test Wikicode.get()"""
code = parse("Have a {{template}} and a [[page|link]]")
@@ -65,6 +68,7 @@ def test_get():
with pytest.raises(IndexError):
code.get(4)


def test_set():
"""test Wikicode.set()"""
code = parse("Have a {{template}} and a [[page|link]]")
@@ -82,6 +86,7 @@ def test_set():
with pytest.raises(IndexError):
code.set(-4, "{{baz}}")


def test_contains():
"""test Wikicode.contains()"""
code = parse("Here is {{aaa|{{bbb|xyz{{ccc}}}}}} and a [[page|link]]")
@@ -93,6 +98,7 @@ def test_contains():
assert code.contains(str(tmpl4)) is True
assert code.contains(tmpl2.params[0].value) is True


def test_index():
"""test Wikicode.index()"""
code = parse("Have a {{template}} and a [[page|link]]")
@@ -105,13 +111,13 @@ def test_index():
code = parse("{{foo}}{{bar|{{baz}}}}")
assert 1 == code.index("{{bar|{{baz}}}}")
assert 1 == code.index("{{baz}}", recursive=True)
assert 1 == code.index(code.get(1).get(1).value,
recursive=True)
assert 1 == code.index(code.get(1).get(1).value, recursive=True)
with pytest.raises(ValueError):
code.index("{{baz}}", recursive=False)
with pytest.raises(ValueError):
code.index(code.get(1).get(1).value, recursive=False)


def test_get_ancestors_parent():
"""test Wikicode.get_ancestors() and Wikicode.get_parent()"""
code = parse("{{a|{{b|{{d|{{e}}{{f}}}}{{g}}}}}}{{c}}")
@@ -130,6 +136,7 @@ def test_get_ancestors_parent():
with pytest.raises(ValueError):
code.get_parent(fake)


def test_insert():
"""test Wikicode.insert()"""
code = parse("Have a {{template}} and a [[page|link]]")
@@ -144,14 +151,22 @@ def test_insert():
code2 = parse("{{foo}}{{bar}}{{baz}}")
code2.insert(1, "abc{{def}}ghi[[jk]]")
assert "{{foo}}abc{{def}}ghi[[jk]]{{bar}}{{baz}}" == code2
assert ["{{foo}}", "abc", "{{def}}", "ghi", "[[jk]]",
"{{bar}}", "{{baz}}"] == code2.nodes
assert [
"{{foo}}",
"abc",
"{{def}}",
"ghi",
"[[jk]]",
"{{bar}}",
"{{baz}}",
] == code2.nodes

code3 = parse("{{foo}}bar")
code3.insert(1000, "[[baz]]")
code3.insert(-1000, "derp")
assert "derp{{foo}}bar[[baz]]" == code3


def _test_search(meth, expected):
"""Base test for insert_before(), insert_after(), and replace()."""
code = parse("{{a}}{{b}}{{c}}{{d}}{{e}}")
@@ -249,6 +264,7 @@ def _test_search(meth, expected):
meth(code9, code9.get_sections()[0], "{{quz}}")
assert expected[8] == code9
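
_test_search() drives insert_before(), insert_after(), and replace() through many edge cases; the basic contract is simply:

from mwparserfromhell import parse

code = parse("{{a}}{{b}}{{c}}")
code.insert_after("{{b}}", "X")      # target by matching value...
code.replace(code.get(0), "{{z}}")   # ...or by node object
assert str(code) == "{{z}}{{b}}X{{c}}"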


def test_insert_before():
"""test Wikicode.insert_before()"""
meth = lambda code, *args, **kw: code.insert_before(*args, **kw)
@@ -265,6 +281,7 @@ def test_insert_before():
]
_test_search(meth, expected)


def test_insert_after():
"""test Wikicode.insert_after()"""
meth = lambda code, *args, **kw: code.insert_after(*args, **kw)
@@ -281,6 +298,7 @@ def test_insert_after():
]
_test_search(meth, expected)


def test_replace():
"""test Wikicode.replace()"""
meth = lambda code, *args, **kw: code.replace(*args, **kw)
@@ -297,6 +315,7 @@ def test_replace():
]
_test_search(meth, expected)


def test_append():
"""test Wikicode.append()"""
code = parse("Have a {{template}}")
@@ -310,6 +329,7 @@ def test_append():
with pytest.raises(ValueError):
code.append(slice(0, 1))


def test_remove():
"""test Wikicode.remove()"""
meth = lambda code, obj, value, **kw: code.remove(obj, **kw)
@@ -326,6 +346,7 @@ def test_remove():
]
_test_search(meth, expected)


def test_matches():
"""test Wikicode.matches()"""
code1 = parse("Cleanup")
@@ -357,17 +378,32 @@ def test_matches():
assert code5.matches("<!-- nothing -->") is True
assert code5.matches(("a", "b", "")) is True


def test_filter_family():
"""test the Wikicode.i?filter() family of functions"""

def genlist(gen):
assert isinstance(gen, GeneratorType)
return list(gen)

ifilter = lambda code: (lambda *a, **k: genlist(code.ifilter(*a, **k)))

code = parse("a{{b}}c[[d]]{{{e}}}{{f}}[[g]]")
for func in (code.filter, ifilter(code)):
assert ["a", "{{b}}", "b", "c", "[[d]]", "d", "{{{e}}}",
"e", "{{f}}", "f", "[[g]]", "g"] == func()
assert [
"a",
"{{b}}",
"b",
"c",
"[[d]]",
"d",
"{{{e}}}",
"e",
"{{f}}",
"f",
"[[g]]",
"g",
] == func()
assert ["{{{e}}}"] == func(forcetype=Argument)
assert code.get(4) is func(forcetype=Argument)[0]
assert list("abcdefg") == func(forcetype=Text)
@@ -377,7 +413,7 @@ def test_filter_family():

funcs = [
lambda name, **kw: getattr(code, "filter_" + name)(**kw),
lambda name, **kw: genlist(getattr(code, "ifilter_" + name)(**kw))
lambda name, **kw: genlist(getattr(code, "ifilter_" + name)(**kw)),
]
for get_filter in funcs:
assert ["{{{e}}}"] == get_filter("arguments")
@@ -393,27 +429,35 @@ def test_filter_family():

code2 = parse("{{a|{{b}}|{{c|d={{f}}{{h}}}}}}")
for func in (code2.filter, ifilter(code2)):
assert ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}"] \
== func(recursive=False, forcetype=Template)
assert ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}", "{{b}}",
"{{c|d={{f}}{{h}}}}", "{{f}}", "{{h}}"] \
== func(recursive=True, forcetype=Template)
assert ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}"] == func(
recursive=False, forcetype=Template
)
assert [
"{{a|{{b}}|{{c|d={{f}}{{h}}}}}}",
"{{b}}",
"{{c|d={{f}}{{h}}}}",
"{{f}}",
"{{h}}",
] == func(recursive=True, forcetype=Template)

code3 = parse("{{foobar}}{{FOO}}{{baz}}{{bz}}{{barfoo}}")
for func in (code3.filter, ifilter(code3)):
assert ["{{foobar}}", "{{barfoo}}"] \
== func(False, matches=lambda node: "foo" in node)
assert ["{{foobar}}", "{{FOO}}", "{{barfoo}}"] \
== func(False, matches=r"foo")
assert ["{{foobar}}", "{{FOO}}"] \
== func(matches=r"^{{foo.*?}}")
assert ["{{foobar}}"] \
== func(matches=r"^{{foo.*?}}", flags=re.UNICODE)
assert ["{{foobar}}", "{{barfoo}}"] == func(
False, matches=lambda node: "foo" in node
)
assert ["{{foobar}}", "{{FOO}}", "{{barfoo}}"] == func(False, matches=r"foo")
assert ["{{foobar}}", "{{FOO}}"] == func(matches=r"^{{foo.*?}}")
assert ["{{foobar}}"] == func(matches=r"^{{foo.*?}}", flags=re.UNICODE)
assert ["{{baz}}", "{{bz}}"] == func(matches=r"^{{b.*?z")
assert ["{{baz}}"] == func(matches=r"^{{b.+?z}}")

exp_rec = ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}", "{{b}}",
"{{c|d={{f}}{{h}}}}", "{{f}}", "{{h}}"]
exp_rec = [
"{{a|{{b}}|{{c|d={{f}}{{h}}}}}}",
"{{b}}",
"{{c|d={{f}}{{h}}}}",
"{{f}}",
"{{h}}",
]
exp_unrec = ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}"]
assert exp_rec == code2.filter_templates()
assert exp_unrec == code2.filter_templates(recursive=False)
@@ -422,9 +466,9 @@ def test_filter_family():
assert exp_unrec == code2.filter_templates(False)

assert ["{{foobar}}"] == code3.filter_templates(
matches=lambda node: node.name.matches("Foobar"))
assert ["{{baz}}", "{{bz}}"] \
== code3.filter_templates(matches=r"^{{b.*?z")
matches=lambda node: node.name.matches("Foobar")
)
assert ["{{baz}}", "{{bz}}"] == code3.filter_templates(matches=r"^{{b.*?z")
assert [] == code3.filter_tags(matches=r"^{{b.*?z")
assert [] == code3.filter_tags(matches=r"^{{b.*?z", flags=0)
with pytest.raises(TypeError):
@@ -440,6 +484,7 @@ def test_filter_family():
assert ["{{foo}}", "{{foo|{{bar}}}}"] == actual1
assert ["{{foo}}", "{{foo|{{bar}}}}"] == actual2


def test_get_sections():
"""test Wikicode.get_sections()"""
page1 = parse("")
@@ -461,44 +506,70 @@ def test_get_sections():

assert [""] == page1.get_sections()
assert ["", "==Heading=="] == page2.get_sections()
assert ["", "===Heading===\nFoo bar baz\n====Gnidaeh====\n", "====Gnidaeh====\n"] \
== page3.get_sections()
assert [p4_lead, p4_I, p4_IA, p4_IB, p4_IB1, p4_II,
p4_III, p4_IIIA, p4_IIIA1a, p4_IIIA2, p4_IIIA2ai1] \
== page4.get_sections()
assert [
"",
"===Heading===\nFoo bar baz\n====Gnidaeh====\n",
"====Gnidaeh====\n",
] == page3.get_sections()
assert [
p4_lead,
p4_I,
p4_IA,
p4_IB,
p4_IB1,
p4_II,
p4_III,
p4_IIIA,
p4_IIIA1a,
p4_IIIA2,
p4_IIIA2ai1,
] == page4.get_sections()

assert ["====Gnidaeh====\n"] == page3.get_sections(levels=[4])
assert ["===Heading===\nFoo bar baz\n====Gnidaeh====\n"] \
== page3.get_sections(levels=(2, 3))
assert ["===Heading===\nFoo bar baz\n"] \
== page3.get_sections(levels=(2, 3), flat=True)
assert ["===Heading===\nFoo bar baz\n====Gnidaeh====\n"] == page3.get_sections(
levels=(2, 3)
)
assert ["===Heading===\nFoo bar baz\n"] == page3.get_sections(
levels=(2, 3), flat=True
)
assert [] == page3.get_sections(levels=[0])
assert ["", "====Gnidaeh====\n"] == page3.get_sections(levels=[4], include_lead=True)
assert ["===Heading===\nFoo bar baz\n====Gnidaeh====\n",
"====Gnidaeh====\n"] == page3.get_sections(include_lead=False)
assert ["===Heading===\nFoo bar baz\n", "====Gnidaeh====\n"] \
== page3.get_sections(flat=True, include_lead=False)
assert ["", "====Gnidaeh====\n"] == page3.get_sections(
levels=[4], include_lead=True
)
assert [
"===Heading===\nFoo bar baz\n====Gnidaeh====\n",
"====Gnidaeh====\n",
] == page3.get_sections(include_lead=False)
assert ["===Heading===\nFoo bar baz\n", "====Gnidaeh====\n"] == page3.get_sections(
flat=True, include_lead=False
)

assert [p4_IB1, p4_IIIA2] == page4.get_sections(levels=[4])
assert [p4_IA, p4_IB, p4_IIIA] == page4.get_sections(levels=[3])
assert [p4_IA, "=== Section I.B ===\n",
"=== Section III.A ===\nText.\n"] \
== page4.get_sections(levels=[3], flat=True)
assert [
p4_IA,
"=== Section I.B ===\n",
"=== Section III.A ===\nText.\n",
] == page4.get_sections(levels=[3], flat=True)
assert ["", ""] == page2.get_sections(include_headings=False)
assert ["\nSection I.B.1 body.\n\n&bull;Some content.\n\n",
"\nEven more text.\n" + p4_IIIA2ai1] \
== page4.get_sections(levels=[4], include_headings=False)
assert [
"\nSection I.B.1 body.\n\n&bull;Some content.\n\n",
"\nEven more text.\n" + p4_IIIA2ai1,
] == page4.get_sections(levels=[4], include_headings=False)

assert [] == page4.get_sections(matches=r"body")
assert [p4_I, p4_IA, p4_IB, p4_IB1] \
== page4.get_sections(matches=r"Section\sI[.\s].*?")
assert [p4_IA, p4_IIIA, p4_IIIA1a, p4_IIIA2, p4_IIIA2ai1] \
== page4.get_sections(matches=r".*?a.*?")
assert [p4_IIIA1a, p4_IIIA2ai1] \
== page4.get_sections(matches=r".*?a.*?", flags=re.U)
assert ["\nMore text.\n", "\nAn invalid section!"] \
== page4.get_sections(matches=r".*?a.*?", flags=re.U,
include_headings=False)
assert [p4_I, p4_IA, p4_IB, p4_IB1] == page4.get_sections(
matches=r"Section\sI[.\s].*?"
)
assert [p4_IA, p4_IIIA, p4_IIIA1a, p4_IIIA2, p4_IIIA2ai1] == page4.get_sections(
matches=r".*?a.*?"
)
assert [p4_IIIA1a, p4_IIIA2ai1] == page4.get_sections(
matches=r".*?a.*?", flags=re.U
)
assert ["\nMore text.\n", "\nAn invalid section!"] == page4.get_sections(
matches=r".*?a.*?", flags=re.U, include_headings=False
)

sections = page2.get_sections(include_headings=False)
sections[0].append("Lead!\n")
@@ -512,22 +583,22 @@ def test_get_sections():
assert "== Foo ==\nBarf {{Haha}}\n" == section
assert "X\n== Foo ==\nBarf {{Haha}}\n== Baz ==\nBuzz" == page5


def test_strip_code():
"""test Wikicode.strip_code()"""
# Since individual nodes have test cases for their __strip__ methods,
# we're only going to do an integration test:
code = parse("Foo [[bar]]\n\n{{baz|hello}}\n\n[[a|b]] &Sigma;")
assert "Foo bar\n\nb Σ" \
== code.strip_code(normalize=True, collapse=True)
assert "Foo bar\n\n\n\nb Σ" \
== code.strip_code(normalize=True, collapse=False)
assert "Foo bar\n\nb &Sigma;" \
== code.strip_code(normalize=False, collapse=True)
assert "Foo bar\n\n\n\nb &Sigma;" \
== code.strip_code(normalize=False, collapse=False)
assert "Foo bar\n\nhello\n\nb Σ" \
== code.strip_code(normalize=True, collapse=True,
keep_template_params=True)
assert "Foo bar\n\nb Σ" == code.strip_code(normalize=True, collapse=True)
assert "Foo bar\n\n\n\nb Σ" == code.strip_code(normalize=True, collapse=False)
assert "Foo bar\n\nb &Sigma;" == code.strip_code(normalize=False, collapse=True)
assert "Foo bar\n\n\n\nb &Sigma;" == code.strip_code(
normalize=False, collapse=False
)
assert "Foo bar\n\nhello\n\nb Σ" == code.strip_code(
normalize=True, collapse=True, keep_template_params=True
)
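
The normalize and collapse knobs in isolation (entity normalization shown here; collapse squashes the blank lines left behind by stripped nodes):

from mwparserfromhell import parse

code = parse("[[a|b]] &Sigma;")
assert code.strip_code() == "b Σ"                       # normalize=True by default
assert code.strip_code(normalize=False) == "b &Sigma;"  # keep entities as written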


def test_get_tree():
"""test Wikicode.get_tree()"""
@@ -535,6 +606,8 @@ def test_get_tree():
# methods, and the docstring covers all possibilities for the output of
# __showtree__, we'll test it only:
code = parse("Lorem ipsum {{foo|bar|{{baz}}|spam=eggs}}")
expected = "Lorem ipsum \n{{\n\t foo\n\t| 1\n\t= bar\n\t| 2\n\t= " + \
"{{\n\t\t\tbaz\n\t }}\n\t| spam\n\t= eggs\n}}"
expected = (
"Lorem ipsum \n{{\n\t foo\n\t| 1\n\t= bar\n\t| 2\n\t= "
+ "{{\n\t\t\tbaz\n\t }}\n\t| spam\n\t= eggs\n}}"
)
assert expected.expandtabs(4) == code.get_tree()

+16 -2 tests/test_wikilink.py

@@ -27,6 +27,7 @@ import pytest
from mwparserfromhell.nodes import Text, Wikilink
from .conftest import assert_wikicode_equal, wrap, wraptext


def test_str():
"""test Wikilink.__str__()"""
node = Wikilink(wraptext("foobar"))
@@ -34,6 +35,7 @@ def test_str():
node2 = Wikilink(wraptext("foo"), wraptext("bar"))
assert "[[foo|bar]]" == str(node2)


def test_children():
"""test Wikilink.__children__()"""
node1 = Wikilink(wraptext("foobar"))
@@ -48,6 +50,7 @@ def test_children():
with pytest.raises(StopIteration):
next(gen2)


def test_strip():
"""test Wikilink.__strip__()"""
node = Wikilink(wraptext("foobar"))
@@ -55,6 +58,7 @@ def test_strip():
assert "foobar" == node.__strip__()
assert "bar" == node2.__strip__()


def test_showtree():
"""test Wikilink.__showtree__()"""
output = []
@@ -66,10 +70,19 @@ def test_showtree():
node1.__showtree__(output.append, get, mark)
node2.__showtree__(output.append, get, mark)
valid = [
"[[", (getter, node1.title), "]]", "[[", (getter, node2.title),
" | ", marker, (getter, node2.text), "]]"]
"[[",
(getter, node1.title),
"]]",
"[[",
(getter, node2.title),
" | ",
marker,
(getter, node2.text),
"]]",
]
assert valid == output


def test_title():
"""test getter/setter for the title attribute"""
title = wraptext("foobar")
@@ -82,6 +95,7 @@ def test_title():
assert_wikicode_equal(wraptext("héhehé"), node1.title)
assert_wikicode_equal(wraptext("héhehé"), node2.title)


def test_text():
"""test getter/setter for the text attribute"""
text = wraptext("baz")

