
Merge branch 'develop' into sections_in_templates

pull/233/head
Ben Kurtovic, 3 years ago (committed by GitHub)
commit 314dd19c47
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
84 changed files with 1258 additions and 1456 deletions
 1. +24   -0    .github/workflows/build-linux-wheels.yml
 2. +30   -0    .github/workflows/build-macos-wheels.yml
 3. +4    -2    .travis.yml
 4. +9    -1    CHANGELOG
 5. +2    -5    README.rst
 6. +8    -16   appveyor.yml
 7. +8    -0    docs/api/mwparserfromhell.nodes.rst
 8. +7    -0    docs/api/mwparserfromhell.parser.rst
 9. +1    -13   docs/api/mwparserfromhell.rst
10. +30   -0    docs/api/mwparserfromhell.smart_list.rst
11. +13   -3    docs/changelog.rst
12. +1    -1    docs/conf.py
13. +1    -1    docs/index.rst
14. +1    -3    docs/usage.rst
15. +3    -5    mwparserfromhell/__init__.py
16. +0    -27   mwparserfromhell/compat.py
17. +60   -21   mwparserfromhell/definitions.py
18. +5    -40   mwparserfromhell/nodes/__init__.py
19. +50   -0    mwparserfromhell/nodes/_base.py
20. +4    -8    mwparserfromhell/nodes/argument.py
21. +4    -8    mwparserfromhell/nodes/comment.py
22. +5    -8    mwparserfromhell/nodes/external_link.py
23. +0    -2    mwparserfromhell/nodes/extras/__init__.py
24. +3    -7    mwparserfromhell/nodes/extras/attribute.py
25. +3    -7    mwparserfromhell/nodes/extras/parameter.py
26. +4    -8    mwparserfromhell/nodes/heading.py
27. +19   -47   mwparserfromhell/nodes/html_entity.py
28. +7    -12   mwparserfromhell/nodes/tag.py
29. +35   -22   mwparserfromhell/nodes/template.py
30. +4    -8    mwparserfromhell/nodes/text.py
31. +4    -8    mwparserfromhell/nodes/wikilink.py
32. +3    -17   mwparserfromhell/parser/__init__.py
33. +7    -12   mwparserfromhell/parser/builder.py
34. +0    -2    mwparserfromhell/parser/contexts.py
35. +1    -25   mwparserfromhell/parser/ctokenizer/common.h
36. +62   -12   mwparserfromhell/parser/ctokenizer/definitions.c
37. +1    -1    mwparserfromhell/parser/ctokenizer/tag_data.h
38. +6    -49   mwparserfromhell/parser/ctokenizer/textbuffer.c
39. +2    -2    mwparserfromhell/parser/ctokenizer/textbuffer.h
40. +26   -30   mwparserfromhell/parser/ctokenizer/tok_parse.c
41. +1    -1    mwparserfromhell/parser/ctokenizer/tok_parse.h
42. +4    -8    mwparserfromhell/parser/ctokenizer/tok_support.c
43. +3    -3    mwparserfromhell/parser/ctokenizer/tok_support.h
44. +9    -25   mwparserfromhell/parser/ctokenizer/tokenizer.c
45. +0    -18   mwparserfromhell/parser/ctokenizer/tokenizer.h
46. +33   -0    mwparserfromhell/parser/errors.py
47. +79   -80   mwparserfromhell/parser/tokenizer.py
48. +2    -8    mwparserfromhell/parser/tokens.py
49. +0    -456  mwparserfromhell/smart_list.py
50. +29   -0    mwparserfromhell/smart_list/__init__.py
51. +229  -0    mwparserfromhell/smart_list/list_proxy.py
52. +158  -0    mwparserfromhell/smart_list/smart_list.py
53. +50   -0    mwparserfromhell/smart_list/utils.py
54. +28   -46   mwparserfromhell/string_mixin.py
55. +20   -25   mwparserfromhell/utils.py
56. +20   -25   mwparserfromhell/wikicode.py
57. +7    -16   scripts/memtest.py
58. +4    -12   setup.py
59. +0    -1    tests/__init__.py
60. +5    -12   tests/_test_tokenizer.py
61. +1    -6    tests/_test_tree_equality.py
62. +0    -18   tests/compat.py
63. +3    -7    tests/test_argument.py
64. +3    -7    tests/test_attribute.py
65. +1    -6    tests/test_builder.py
66. +3    -7    tests/test_comment.py
67. +0    -3    tests/test_ctokenizer.py
68. +15   -34   tests/test_docs.py
69. +3    -7    tests/test_external_link.py
70. +3    -7    tests/test_heading.py
71. +4    -8    tests/test_html_entity.py
72. +4    -9    tests/test_parameter.py
73. +0    -4    tests/test_parser.py
74. +0    -3    tests/test_pytokenizer.py
75. +0    -3    tests/test_roundtripping.py
76. +17   -29   tests/test_smart_list.py
77. +42   -72   tests/test_string_mixin.py
78. +4    -8    tests/test_tag.py
79. +3    -7    tests/test_template.py
80. +3    -7    tests/test_text.py
81. +4    -16   tests/test_tokens.py
82. +0    -3    tests/test_utils.py
83. +4    -9    tests/test_wikicode.py
84. +3    -7    tests/test_wikilink.py

.github/workflows/build-linux-wheels.yml (+24, -0)

@@ -0,0 +1,24 @@
+name: Build manylinux1 wheels
+
+on: push
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Build manylinux1 Python wheels
+        uses: RalfG/python-wheels-manylinux-build@0c24cb31441c7a1e6ea90d6a6408d406b2fee279
+        with:
+          python-versions: 'cp35-cp35m cp36-cp36m cp37-cp37m cp38-cp38 cp39-cp39'
+      - name: Move to dist/
+        run: |
+          mkdir -p dist
+          cp -v wheelhouse/*-manylinux1_x86_64.whl dist/
+      - name: Publish package to PyPI
+        # Only actually publish if a new tag was pushed
+        if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
+        uses: pypa/gh-action-pypi-publish@37e305e7413032d8422456179fee28fac7d25187
+        with:
+          user: __token__
+          password: ${{ secrets.pypi_password }}

.github/workflows/build-macos-wheels.yml (+30, -0)

@@ -0,0 +1,30 @@
+name: Build macOS wheels
+
+on: push
+
+jobs:
+  build:
+    runs-on: macos-latest
+    strategy:
+      matrix:
+        python-version: [3.5, 3.6, 3.7, 3.8, 3.9]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Build wheels
+        run: |
+          python -m pip install --upgrade pip wheel setuptools
+          pip wheel . -w dist/
+          ls dist/
+      - name: Publish package to PyPI
+        # Only actually publish if a new tag was pushed
+        if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
+        # We can't use the pypa action because of https://github.com/pypa/gh-action-pypi-publish/issues/15
+        run: |
+          pip install twine
+          TWINE_USERNAME="__token__" \
+          TWINE_PASSWORD="${{ secrets.pypi_password }}" \
+          twine upload dist/*

.travis.yml (+4, -2)

@@ -1,12 +1,14 @@
 dist: xenial
 language: python
 python:
-  - 2.7
-  - 3.4
   - 3.5
   - 3.6
   - 3.7
   - 3.8
+  - 3.9
+arch:
+  - amd64
+  - ppc64le
 install:
   - pip install coveralls
   - python setup.py develop


CHANGELOG (+9, -1)

@@ -1,9 +1,17 @@
 v0.6 (unreleased):

-- Added support for Python 3.8.
+Thanks to everyone for their patience with this release!
+
+- Breaking change: dropped support for end-of-life Python 2.7 and 3.4.
+- Added support for Python 3.8 and 3.9.
+- Added binary wheels for Linux and macOS.
 - Updated Wikicode.matches() to recognize underscores as being equivalent
   to spaces. (#216)
+- Add a 'default' parameter to Template.get, and implement dict-style item
+  access for template parameters. (#252)
 - Fixed a rare parsing bug involving deeply nested style tags. (#224)
+- Updated HTML tag definitions.
+- Internal refactoring and cleanup.

 v0.5.4 (released May 15, 2019):




README.rst (+2, -5)

@@ -11,7 +11,7 @@ mwparserfromhell

 **mwparserfromhell** (the *MediaWiki Parser from Hell*) is a Python package
 that provides an easy-to-use and outrageously powerful parser for MediaWiki_
-wikicode. It supports Python 2 and Python 3.
+wikicode. It supports Python 3.5+.

 Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others.
 Full documentation is available on ReadTheDocs_. Development occurs on GitHub_.
@@ -41,8 +41,7 @@ Normal usage is rather straightforward (where ``text`` is page text):
 >>> wikicode = mwparserfromhell.parse(text)

 ``wikicode`` is a ``mwparserfromhell.Wikicode`` object, which acts like an
-ordinary ``str`` object (or ``unicode`` in Python 2) with some extra methods.
-For example:
+ordinary ``str`` object with some extra methods. For example:

 >>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?"
 >>> wikicode = mwparserfromhell.parse(text)
@@ -111,8 +110,6 @@ saving the page!) by calling ``str()`` on it:
 >>> text == code
 True

-Likewise, use ``unicode(code)`` in Python 2.
-
 Limitations
 -----------




appveyor.yml (+8, -16)

@@ -22,22 +22,6 @@ environment:
   secure: gOIcvPxSC2ujuhwOzwj3v8xjq3CCYd8keFWVnguLM+gcL0e02qshDHy7gwZZwj0+

  matrix:
-    - PYTHON: "C:\\Python27"
-      PYTHON_VERSION: "2.7"
-      PYTHON_ARCH: "32"
-
-    - PYTHON: "C:\\Python27-x64"
-      PYTHON_VERSION: "2.7"
-      PYTHON_ARCH: "64"
-
-    - PYTHON: "C:\\Python34"
-      PYTHON_VERSION: "3.4"
-      PYTHON_ARCH: "32"
-
-    - PYTHON: "C:\\Python34-x64"
-      PYTHON_VERSION: "3.4"
-      PYTHON_ARCH: "64"
-
     - PYTHON: "C:\\Python35"
       PYTHON_VERSION: "3.5"
       PYTHON_ARCH: "32"
@@ -70,6 +54,14 @@ environment:
       PYTHON_VERSION: "3.8"
       PYTHON_ARCH: "64"

+    - PYTHON: "C:\\Python39"
+      PYTHON_VERSION: "3.9"
+      PYTHON_ARCH: "32"
+
+    - PYTHON: "C:\\Python39-x64"
+      PYTHON_VERSION: "3.9"
+      PYTHON_ARCH: "64"
+
 install:
   - "%PIP% install --disable-pip-version-check --user --upgrade pip"
   - "%PIP% install wheel twine"


docs/api/mwparserfromhell.nodes.rst (+8, -0)

@@ -9,6 +9,14 @@ nodes Package
 .. autoclass:: mwparserfromhell.nodes.Node
     :special-members:

+:mod:`_base` Module
+----------------------
+
+.. automodule:: mwparserfromhell.nodes._base
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 :mod:`argument` Module
 ----------------------




docs/api/mwparserfromhell.parser.rst (+7, -0)

@@ -23,6 +23,13 @@ parser Package
     :members:
     :undoc-members:

+:mod:`errors` Module
+--------------------
+
+.. automodule:: mwparserfromhell.parser.errors
+    :members:
+    :undoc-members:
+
 :mod:`tokenizer` Module
 -----------------------




docs/api/mwparserfromhell.rst (+1, -13)

@@ -8,27 +8,12 @@ mwparserfromhell Package
     :members:
     :undoc-members:

-:mod:`compat` Module
---------------------
-
-.. automodule:: mwparserfromhell.compat
-    :members:
-    :undoc-members:

 :mod:`definitions` Module
 -------------------------

 .. automodule:: mwparserfromhell.definitions
     :members:

-:mod:`smart_list` Module
-------------------------
-
-.. automodule:: mwparserfromhell.smart_list
-    :members: SmartList, _ListProxy
-    :undoc-members:
-    :show-inheritance:

 :mod:`string_mixin` Module
 --------------------------

@@ -58,3 +43,4 @@ Subpackages

     mwparserfromhell.nodes
     mwparserfromhell.parser
+    mwparserfromhell.smart_list

docs/api/mwparserfromhell.smart_list.rst (+30, -0)

@@ -0,0 +1,30 @@
+smart_list Package
+==================
+
+:mod:`smart_list` Package
+-------------------------
+
+.. automodule:: mwparserfromhell.smart_list
+    :members:
+    :undoc-members:
+
+:mod:`list_proxy` Module
+---------------------
+
+.. automodule:: mwparserfromhell.smart_list.list_proxy
+    :members:
+    :undoc-members:
+
+:mod:`smart_list` Module
+---------------------
+
+.. automodule:: mwparserfromhell.smart_list.smart_list
+    :members:
+    :undoc-members:
+
+:mod:`utils` Module
+---------------------
+
+.. automodule:: mwparserfromhell.smart_list.utils
+    :members:
+    :undoc-members:

docs/changelog.rst (+13, -3)

@@ -7,11 +7,21 @@ v0.6
 Unreleased
 (`changes <https://github.com/earwig/mwparserfromhell/compare/v0.5.4...develop>`__):

-- Added support for Python 3.8.
-- Updated Wikicode.matches() to recognize underscores as being equivalent
-  to spaces. (`#216 <https://github.com/earwig/mwparserfromhell/issues/216>`_)
+Thanks to everyone for their patience with this release!
+
+- Breaking change: dropped support for end-of-life Python 2.7 and 3.4.
+- Added support for Python 3.8 and 3.9.
+- Added binary wheels for Linux and macOS.
+- Updated :meth:`.Wikicode.matches` to recognize underscores as being
+  equivalent to spaces.
+  (`#216 <https://github.com/earwig/mwparserfromhell/issues/216>`_)
+- Add a `default` parameter to :meth:`.Template.get`, and implement dict-style
+  item access for template parameters.
+  (`#252 <https://github.com/earwig/mwparserfromhell/issues/252>`_)
 - Fixed a rare parsing bug involving deeply nested style tags.
   (`#224 <https://github.com/earwig/mwparserfromhell/issues/224>`_)
+- Updated HTML tag definitions.
+- Internal refactoring and cleanup.

 v0.5.4
 ------


docs/conf.py (+1, -1)

@@ -42,7 +42,7 @@ master_doc = 'index'

 # General information about the project.
 project = u'mwparserfromhell'
-copyright = u'2012–2019 Ben Kurtovic'
+copyright = u'2012–2020 Ben Kurtovic'

 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the


docs/index.rst (+1, -1)

@@ -3,7 +3,7 @@ MWParserFromHell v\ |version| Documentation

 :mod:`mwparserfromhell` (the *MediaWiki Parser from Hell*) is a Python package
 that provides an easy-to-use and outrageously powerful parser for MediaWiki_
-wikicode. It supports Python 2 and Python 3.
+wikicode. It supports Python 3.5+.

 Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others.
 Development occurs on GitHub_.


docs/usage.rst (+1, -3)

@@ -7,8 +7,7 @@ Normal usage is rather straightforward (where ``text`` is page text)::
     >>> wikicode = mwparserfromhell.parse(text)

 ``wikicode`` is a :class:`mwparserfromhell.Wikicode <.Wikicode>` object, which
-acts like an ordinary ``str`` object (or ``unicode`` in Python 2) with some
-extra methods. For example::
+acts like an ordinary ``str`` object with some extra methods. For example::

     >>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?"
     >>> wikicode = mwparserfromhell.parse(text)
@@ -78,7 +77,6 @@ saving the page!) by calling :func:`str` on it::
     >>> text == code
     True

-(Likewise, use :func:`unicode(code) <unicode>` in Python 2.)

 For more tips, check out :class:`Wikicode's full method list <.Wikicode>` and
 the :mod:`list of Nodes <.nodes>`.

mwparserfromhell/__init__.py (+3, -5)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -27,12 +25,12 @@ outrageously powerful parser for `MediaWiki <https://www.mediawiki.org>`_ wikico
 """

 __author__ = "Ben Kurtovic"
-__copyright__ = "Copyright (C) 2012-2019 Ben Kurtovic"
+__copyright__ = "Copyright (C) 2012-2020 Ben Kurtovic"
 __license__ = "MIT License"
 __version__ = "0.6.dev0"
 __email__ = "ben.kurtovic@gmail.com"

-from . import (compat, definitions, nodes, parser, smart_list, string_mixin,
+from . import (definitions, nodes, parser, smart_list, string_mixin,
                utils, wikicode)

 parse = utils.parse_anything

mwparserfromhell/compat.py (+0, -27)

@@ -1,27 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""
-Implements support for both Python 2 and Python 3 by defining common types in
-terms of their Python 2/3 variants. For example, :class:`str` is set to
-:class:`unicode` on Python 2 but :class:`str` on Python 3; likewise,
-:class:`bytes` is :class:`str` on 2 but :class:`bytes` on 3. These types are
-meant to be imported directly from within the parser's modules.
-"""
-
-import sys
-
-py3k = (sys.version_info[0] == 3)
-
-if py3k:
-    bytes = bytes
-    str = str
-    range = range
-    import html.entities as htmlentities
-
-else:
-    bytes = str
-    str = unicode
-    range = xrange
-    import htmlentitydefs as htmlentities
-
-del sys

mwparserfromhell/definitions.py (+60, -21)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -28,36 +26,77 @@ When updating this file, please also update the the C tokenizer version:
 - mwparserfromhell/parser/ctokenizer/definitions.h
 """

-from __future__ import unicode_literals
-
 __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
            "is_single_only", "is_scheme"]

 URI_SCHEMES = {
-    # [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943
-    "http": True, "https": True, "ftp": True, "ftps": True, "ssh": True,
-    "sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False,
-    "sips": False, "gopher": True, "telnet": True, "nntp": True,
-    "worldwind": True, "mailto": False, "tel": False, "sms": False,
-    "news": False, "svn": True, "git": True, "mms": True, "bitcoin": False,
-    "magnet": False, "urn": False, "geo": False
+    # [wikimedia/mediawiki.git]/includes/DefaultSettings.php @ 5c660de5d0
+    "bitcoin": False,
+    "ftp": True,
+    "ftps": True,
+    "geo": False,
+    "git": True,
+    "gopher": True,
+    "http": True,
+    "https": True,
+    "irc": True,
+    "ircs": True,
+    "magnet": False,
+    "mailto": False,
+    "mms": True,
+    "news": False,
+    "nntp": True,
+    "redis": True,
+    "sftp": True,
+    "sip": False,
+    "sips": False,
+    "sms": False,
+    "ssh": True,
+    "svn": True,
+    "tel": False,
+    "telnet": True,
+    "urn": False,
+    "worldwind": True,
+    "xmpp": False,
 }

 PARSER_BLACKLIST = [
-    # enwiki extensions @ 2013-06-28
-    "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
-    "nowiki", "pre", "score", "section", "source", "syntaxhighlight",
-    "templatedata", "timeline"
+    # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
+    "categorytree",
+    "ce",
+    "chem",
+    "gallery",
+    "graph",
+    "hiero",
+    "imagemap",
+    "inputbox",
+    "math",
+    "nowiki",
+    "pre",
+    "score",
+    "section",
+    "source",
+    "syntaxhighlight",
+    "templatedata",
+    "timeline",
 ]

 INVISIBLE_TAGS = [
-    # enwiki extensions @ 2013-06-28
-    "categorytree", "gallery", "imagemap", "inputbox", "math", "score",
-    "section", "templatedata", "timeline"
+    # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
+    "categorytree",
+    "gallery",
+    "graph",
+    "imagemap",
+    "inputbox",
+    "math",
+    "score",
+    "section",
+    "templatedata",
+    "timeline"
 ]

-# [mediawiki/core.git]/includes/Sanitizer.php @ 065bec63ea
-SINGLE_ONLY = ["br", "hr", "meta", "link", "img", "wbr"]
+# [wikimedia/mediawiki.git]/includes/parser/Sanitizer.php @ 95e17ee645
+SINGLE_ONLY = ["br", "wbr", "hr", "meta", "link", "img"]
 SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"]

 MARKUP_TO_HTML = {
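
Note: these tables back the helper functions listed in __all__ above. A minimal sketch of how they behave, assuming the scheme and tag tables shown in this hunk (illustration only, not part of the diff):

    from mwparserfromhell.definitions import (is_parsable, is_scheme,
                                              is_single_only, is_visible)

    print(is_scheme("https"))      # True: listed in URI_SCHEMES
    print(is_parsable("nowiki"))   # False: <nowiki> contents are not parsed
    print(is_visible("gallery"))   # False: <gallery> contents are not shown as text
    print(is_single_only("br"))    # True: <br> never takes a closing tag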


mwparserfromhell/nodes/__init__.py (+5, -40)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -29,44 +27,8 @@ the name of a :class:`.Template` is a :class:`.Wikicode` object that can
 contain text or more templates.
 """

-from __future__ import unicode_literals
-
-from ..compat import str
-from ..string_mixin import StringMixIn
-
-__all__ = ["Argument", "Comment", "ExternalLink", "HTMLEntity", "Heading",
-           "Node", "Tag", "Template", "Text", "Wikilink"]
-
-class Node(StringMixIn):
-    """Represents the base Node type, demonstrating the methods to override.
-
-    :meth:`__unicode__` must be overridden. It should return a ``unicode`` or
-    (``str`` in py3k) representation of the node. If the node contains
-    :class:`.Wikicode` objects inside of it, :meth:`__children__` should be a
-    generator that iterates over them. If the node is printable
-    (shown when the page is rendered), :meth:`__strip__` should return its
-    printable version, stripping out any formatting marks. It does not have to
-    return a string, but something that can be converted to a string with
-    ``str()``. Finally, :meth:`__showtree__` can be overridden to build a
-    nice tree representation of the node, if desired, for
-    :meth:`~.Wikicode.get_tree`.
-    """
-    def __unicode__(self):
-        raise NotImplementedError()
-
-    def __children__(self):
-        return
-        # pylint: disable=unreachable
-        yield  # pragma: no cover (this is a generator that yields nothing)
-
-    def __strip__(self, **kwargs):
-        return None
-
-    def __showtree__(self, write, get, mark):
-        write(str(self))
-
 from . import extras
+from ._base import Node
 from .text import Text
 from .argument import Argument
 from .comment import Comment
@@ -76,3 +38,6 @@ from .html_entity import HTMLEntity
 from .tag import Tag
 from .template import Template
 from .wikilink import Wikilink
+
+__all__ = ["Argument", "Comment", "ExternalLink", "HTMLEntity", "Heading",
+           "Node", "Tag", "Template", "Text", "Wikilink"]

mwparserfromhell/nodes/_base.py (+50, -0)

@@ -0,0 +1,50 @@
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from ..string_mixin import StringMixIn
+
+__all__ = ["Node"]
+
+class Node(StringMixIn):
+    """Represents the base Node type, demonstrating the methods to override.
+
+    :meth:`__str__` must be overridden. It should return a ``str``
+    representation of the node. If the node contains :class:`.Wikicode`
+    objects inside of it, :meth:`__children__` should be a generator that
+    iterates over them. If the node is printable (shown when the page is
+    rendered), :meth:`__strip__` should return its printable version,
+    stripping out any formatting marks. It does not have to return a string,
+    but something that can be converted to a string with ``str()``. Finally,
+    :meth:`__showtree__` can be overridden to build a nice tree representation
+    of the node, if desired, for :meth:`~.Wikicode.get_tree`.
+    """
+    def __str__(self):
+        raise NotImplementedError()
+
+    def __children__(self):
+        return
+        # pylint: disable=unreachable
+        yield  # pragma: no cover (this is a generator that yields nothing)
+
+    def __strip__(self, **kwargs):
+        return None
+
+    def __showtree__(self, write, get, mark):
+        write(str(self))
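
Note: the docstring above describes the subclassing contract. A minimal illustrative sketch of a conforming subclass (ExampleNode is hypothetical and not part of this commit):

    from mwparserfromhell.nodes._base import Node

    class ExampleNode(Node):
        """Hypothetical node type used only to illustrate the overrides."""

        def __init__(self, text):
            super().__init__()
            self.text = text

        def __str__(self):                 # required: the node's wikitext form
            return "<!-- " + self.text + " -->"

        def __strip__(self, **kwargs):     # nothing visible when rendered
            return None

        def __showtree__(self, write, get, mark):
            write(str(self))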

mwparserfromhell/nodes/argument.py (+4, -8)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from __future__ import unicode_literals
-
-from . import Node
-from ..compat import str
+from ._base import Node
 from ..utils import parse_anything

 __all__ = ["Argument"]
@@ -32,11 +28,11 @@ class Argument(Node):
     """Represents a template argument substitution, like ``{{{foo}}}``."""

     def __init__(self, name, default=None):
-        super(Argument, self).__init__()
+        super().__init__()
         self.name = name
         self.default = default

-    def __unicode__(self):
+    def __str__(self):
         start = "{{{" + str(self.name)
         if self.default is not None:
             return start + "|" + str(self.default) + "}}}"


mwparserfromhell/nodes/comment.py (+4, -8)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from __future__ import unicode_literals
-
-from . import Node
-from ..compat import str
+from ._base import Node

 __all__ = ["Comment"]

@@ -31,10 +27,10 @@ class Comment(Node):
     """Represents a hidden HTML comment, like ``<!-- foobar -->``."""

     def __init__(self, contents):
-        super(Comment, self).__init__()
+        super().__init__()
         self.contents = contents

-    def __unicode__(self):
+    def __str__(self):
         return "<!--" + self.contents + "-->"

     @property


mwparserfromhell/nodes/external_link.py (+5, -8)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from __future__ import unicode_literals
-
-from . import Node
-from ..compat import str
+from ._base import Node
 from ..utils import parse_anything

 __all__ = ["ExternalLink"]
@@ -32,12 +28,12 @@ class ExternalLink(Node):
     """Represents an external link, like ``[http://example.com/ Example]``."""

     def __init__(self, url, title=None, brackets=True):
-        super(ExternalLink, self).__init__()
+        super().__init__()
         self.url = url
         self.title = title
         self.brackets = brackets

-    def __unicode__(self):
+    def __str__(self):
         if self.brackets:
             if self.title is not None:
                 return "[" + str(self.url) + " " + str(self.title) + "]"
@@ -82,6 +78,7 @@ class ExternalLink(Node):

     @url.setter
     def url(self, value):
+        # pylint: disable=import-outside-toplevel
         from ..parser import contexts
         self._url = parse_anything(value, contexts.EXT_LINK_URI)




mwparserfromhell/nodes/extras/__init__.py (+0, -2)

@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy


mwparserfromhell/nodes/extras/attribute.py (+3, -7)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,9 +18,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from __future__ import unicode_literals
-
-from ...compat import str
 from ...string_mixin import StringMixIn
 from ...utils import parse_anything

@@ -38,7 +34,7 @@ class Attribute(StringMixIn):

     def __init__(self, name, value=None, quotes='"', pad_first=" ",
                  pad_before_eq="", pad_after_eq=""):
-        super(Attribute, self).__init__()
+        super().__init__()
         self.name = name
         self._quotes = None
         self.value = value
@@ -47,7 +43,7 @@ class Attribute(StringMixIn):
         self.pad_before_eq = pad_before_eq
         self.pad_after_eq = pad_after_eq

-    def __unicode__(self):
+    def __str__(self):
         result = self.pad_first + str(self.name) + self.pad_before_eq
         if self.value is not None:
             result += "=" + self.pad_after_eq


mwparserfromhell/nodes/extras/parameter.py (+3, -7)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from __future__ import unicode_literals
 import re

-from ...compat import str
 from ...string_mixin import StringMixIn
 from ...utils import parse_anything

@@ -39,12 +35,12 @@ class Parameter(StringMixIn):
     """

     def __init__(self, name, value, showkey=True):
-        super(Parameter, self).__init__()
+        super().__init__()
         self.name = name
         self.value = value
         self.showkey = showkey

-    def __unicode__(self):
+    def __str__(self):
         if self.showkey:
             return str(self.name) + "=" + str(self.value)
         return str(self.value)


mwparserfromhell/nodes/heading.py (+4, -8)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from __future__ import unicode_literals
-
-from . import Node
-from ..compat import str
+from ._base import Node
 from ..utils import parse_anything

 __all__ = ["Heading"]
@@ -32,11 +28,11 @@ class Heading(Node):
     """Represents a section heading in wikicode, like ``== Foo ==``."""

     def __init__(self, title, level):
-        super(Heading, self).__init__()
+        super().__init__()
         self.title = title
         self.level = level

-    def __unicode__(self):
+    def __str__(self):
         return ("=" * self.level) + str(self.title) + ("=" * self.level)

     def __children__(self):


mwparserfromhell/nodes/html_entity.py (+19, -47)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from __future__ import unicode_literals
+import html.entities as htmlentities

-from . import Node
-from ..compat import htmlentities, py3k, str
+from ._base import Node

 __all__ = ["HTMLEntity"]

@@ -31,7 +28,7 @@ class HTMLEntity(Node):
     """Represents an HTML entity, like ``&nbsp;``, either named or unnamed."""

     def __init__(self, value, named=None, hexadecimal=False, hex_char="x"):
-        super(HTMLEntity, self).__init__()
+        super().__init__()
         self._value = value
         if named is None:  # Try to guess whether or not the entity is named
             try:
@@ -51,7 +48,7 @@ class HTMLEntity(Node):
         self._hexadecimal = hexadecimal
         self._hex_char = hex_char

-    def __unicode__(self):
+    def __str__(self):
         if self.named:
             return "&{};".format(self.value)
         if self.hexadecimal:
@@ -63,32 +60,6 @@ class HTMLEntity(Node):
             return self.normalize()
         return self

-    if not py3k:
-        @staticmethod
-        def _unichr(value):
-            """Implement builtin unichr() with support for non-BMP code points.
-
-            On wide Python builds, this functions like the normal unichr(). On
-            narrow builds, this returns the value's encoded surrogate pair.
-            """
-            try:
-                return unichr(value)
-            except ValueError:
-                # Test whether we're on the wide or narrow Python build. Check
-                # the length of a non-BMP code point
-                # (U+1F64A, SPEAK-NO-EVIL MONKEY):
-                if len("\U0001F64A") == 1:  # pragma: no cover
-                    raise
-                # Ensure this is within the range we can encode:
-                if value > 0x10FFFF:
-                    raise ValueError("unichr() arg not in range(0x110000)")
-                code = value - 0x10000
-                if value < 0:  # Invalid code point
-                    raise
-                lead = 0xD800 + (code >> 10)
-                trail = 0xDC00 + (code % (1 << 10))
-                return unichr(lead) + unichr(trail)
-
     @property
     def value(self):
         """The string value of the HTML entity."""
@@ -126,21 +97,23 @@ class HTMLEntity(Node):
             int(newval)
         except ValueError:
             try:
-                int(newval, 16)
+                intval = int(newval, 16)
             except ValueError:
                 if newval not in htmlentities.entitydefs:
-                    raise ValueError("entity value is not a valid name")
+                    raise ValueError(
+                        "entity value {!r} is not a valid name".format(newval)) from None
                 self._named = True
                 self._hexadecimal = False
             else:
-                if int(newval, 16) < 0 or int(newval, 16) > 0x10FFFF:
-                    raise ValueError("entity value is not in range(0x110000)")
+                if intval < 0 or intval > 0x10FFFF:
+                    raise ValueError(
+                        "entity value 0x{:x} is not in range(0x110000)".format(intval)) from None
                 self._named = False
                 self._hexadecimal = True
         else:
             test = int(newval, 16 if self.hexadecimal else 10)
             if test < 0 or test > 0x10FFFF:
-                raise ValueError("entity value is not in range(0x110000)")
+                raise ValueError("entity value {} is not in range(0x110000)".format(test))
             self._named = False
         self._value = newval

@@ -148,13 +121,13 @@ class HTMLEntity(Node):
     def named(self, newval):
         newval = bool(newval)
         if newval and self.value not in htmlentities.entitydefs:
-            raise ValueError("entity value is not a valid name")
+            raise ValueError("entity value {!r} is not a valid name".format(self.value))
         if not newval:
             try:
                 int(self.value, 16)
-            except ValueError:
-                err = "current entity value is not a valid Unicode codepoint"
-                raise ValueError(err)
+            except ValueError as exc:
+                raise ValueError("current entity value {!r} is not a valid "
+                                 "Unicode codepoint".format(self.value)) from exc
         self._named = newval

     @hexadecimal.setter
@@ -173,9 +146,8 @@ class HTMLEntity(Node):

     def normalize(self):
         """Return the unicode character represented by the HTML entity."""
-        chrfunc = chr if py3k else HTMLEntity._unichr
         if self.named:
-            return chrfunc(htmlentities.name2codepoint[self.value])
+            return chr(htmlentities.name2codepoint[self.value])
         if self.hexadecimal:
-            return chrfunc(int(self.value, 16))
-        return chrfunc(int(self.value))
+            return chr(int(self.value, 16))
+        return chr(int(self.value))
mwparserfromhell/nodes/tag.py (+7, -12)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,11 +18,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from __future__ import unicode_literals
-
-from . import Node
+from ._base import Node
 from .extras import Attribute
-from ..compat import str
 from ..definitions import is_visible
 from ..utils import parse_anything

@@ -37,7 +33,7 @@ class Tag(Node):
                  self_closing=False, invalid=False, implicit=False, padding="",
                  closing_tag=None, wiki_style_separator=None,
                  closing_wiki_markup=None):
-        super(Tag, self).__init__()
+        super().__init__()
         self.tag = tag
         self.contents = contents
         self._attrs = attrs if attrs else []
@@ -53,7 +49,7 @@ class Tag(Node):
         if closing_wiki_markup is not None:
             self.closing_wiki_markup = closing_wiki_markup

-    def __unicode__(self):
+    def __str__(self):
         if self.wiki_markup:
             if self.attributes:
                 attrs = "".join([str(attr) for attr in self.attributes])
@@ -63,10 +59,9 @@ class Tag(Node):
             separator = self.wiki_style_separator or ""
             if self.self_closing:
                 return self.wiki_markup + attrs + padding + separator
-            else:
-                close = self.closing_wiki_markup or ""
-                return self.wiki_markup + attrs + padding + separator + \
-                       str(self.contents) + close
+            close = self.closing_wiki_markup or ""
+            return self.wiki_markup + attrs + padding + separator + \
+                   str(self.contents) + close

         result = ("</" if self.invalid else "<") + str(self.tag)
         if self.attributes:


mwparserfromhell/nodes/template.py (+35, -22)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,36 +18,37 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from __future__ import unicode_literals
 from collections import defaultdict
 import re

-from . import HTMLEntity, Node, Text
+from ._base import Node
+from .html_entity import HTMLEntity
+from .text import Text
 from .extras import Parameter
-from ..compat import range, str
 from ..utils import parse_anything

 __all__ = ["Template"]

 FLAGS = re.DOTALL | re.UNICODE
+# Used to allow None as a valid fallback value
+_UNSET = object()

 class Template(Node):
     """Represents a template in wikicode, like ``{{foo}}``."""

     def __init__(self, name, params=None):
-        super(Template, self).__init__()
+        super().__init__()
         self.name = name
         if params:
             self._params = params
         else:
             self._params = []

-    def __unicode__(self):
+    def __str__(self):
         if self.params:
             params = "|".join([str(param) for param in self.params])
             return "{{" + str(self.name) + "|" + params + "}}"
-        else:
-            return "{{" + str(self.name) + "}}"
+        return "{{" + str(self.name) + "}}"

     def __children__(self):
         yield self.name
@@ -103,6 +102,7 @@ class Template(Node):
         confidence = float(best) / sum(values)
         if confidence > 0.5:
             return tuple(theories.keys())[values.index(best)]
+        return None

     @staticmethod
     def _blank_param_value(value):
@@ -211,23 +211,29 @@ class Template(Node):
                 return True
         return False

-    has_param = lambda self, name, ignore_empty=False: \
-                self.has(name, ignore_empty)
-    has_param.__doc__ = "Alias for :meth:`has`."
+    def has_param(self, name, ignore_empty=False):
+        """Alias for :meth:`has`."""
+        return self.has(name, ignore_empty)

-    def get(self, name):
+    def get(self, name, default=_UNSET):
         """Get the parameter whose name is *name*.

         The returned object is a :class:`.Parameter` instance. Raises
-        :exc:`ValueError` if no parameter has this name. Since multiple
-        parameters can have the same name, we'll return the last match, since
-        the last parameter is the only one read by the MediaWiki parser.
+        :exc:`ValueError` if no parameter has this name. If *default* is set,
+        returns that instead. Since multiple parameters can have the same name,
+        we'll return the last match, since the last parameter is the only one
+        read by the MediaWiki parser.
         """
         name = str(name).strip()
         for param in reversed(self.params):
             if param.name.strip() == name:
                 return param
-        raise ValueError(name)
+        if default is _UNSET:
+            raise ValueError(name)
+        return default
+
+    def __getitem__(self, name):
+        return self.get(name)

     def add(self, name, value, showkey=None, before=None,
             preserve_spacing=True):
@@ -309,6 +315,9 @@ class Template(Node):
         self.params.append(param)
         return param

+    def __setitem__(self, name, value):
+        return self.add(name, value)
+
     def remove(self, param, keep_field=False):
         """Remove a parameter from the template, identified by *param*.

@@ -330,19 +339,20 @@ class Template(Node):
            hidden name, if it exists, or the first instance).
         """
         if isinstance(param, Parameter):
-            return self._remove_exact(param, keep_field)
+            self._remove_exact(param, keep_field)
+            return

         name = str(param).strip()
         removed = False
         to_remove = []

-        for i, param in enumerate(self.params):
-            if param.name.strip() == name:
+        for i, par in enumerate(self.params):
+            if par.name.strip() == name:
                 if keep_field:
                     if self._should_remove(i, name):
                         to_remove.append(i)
                     else:
-                        self._blank_param_value(param.value)
+                        self._blank_param_value(par.value)
                         keep_field = False
                 else:
                     self._fix_dependendent_params(i)
@@ -354,3 +364,6 @@ class Template(Node):
             raise ValueError(name)
         for i in reversed(to_remove):
             self.params.pop(i)
+
+    def __delitem__(self, param):
+        return self.remove(param)
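
Note: the new get() default and the __getitem__/__setitem__/__delitem__ hooks added in this hunk can be exercised like so (a sketch; the wikitext is made up):

    import mwparserfromhell

    code = mwparserfromhell.parse("{{foo|bar|baz|eggs=spam}}")
    tmpl = code.filter_templates()[0]

    print(tmpl.get("eggs").value)         # spam
    print(tmpl.get("beans", "fallback"))  # fallback, instead of raising ValueError
    print(tmpl["eggs"].value)             # spam   (__getitem__ delegates to get)
    tmpl["eggs"] = "ham"                  # __setitem__ delegates to add
    del tmpl["eggs"]                      # __delitem__ delegates to remove
    print(tmpl.has("eggs"))               # False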

mwparserfromhell/nodes/text.py (+4, -8)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from __future__ import unicode_literals
-
-from . import Node
-from ..compat import str
+from ._base import Node

 __all__ = ["Text"]

@@ -31,10 +27,10 @@ class Text(Node):
     """Represents ordinary, unformatted text with no special properties."""

     def __init__(self, value):
-        super(Text, self).__init__()
+        super().__init__()
         self.value = value

-    def __unicode__(self):
+    def __str__(self):
         return self.value

     def __strip__(self, **kwargs):


mwparserfromhell/nodes/wikilink.py (+4, -8)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-from __future__ import unicode_literals
-
-from . import Node
-from ..compat import str
+from ._base import Node
 from ..utils import parse_anything

 __all__ = ["Wikilink"]
@@ -32,11 +28,11 @@ class Wikilink(Node):
     """Represents an internal wikilink, like ``[[Foo|Bar]]``."""

     def __init__(self, title, text=None):
-        super(Wikilink, self).__init__()
+        super().__init__()
         self.title = title
         self.text = text

-    def __unicode__(self):
+    def __str__(self):
         if self.text is not None:
             return "[[" + str(self.title) + "|" + str(self.text) + "]]"
         return "[[" + str(self.title) + "]]"


mwparserfromhell/parser/__init__.py (+3, -17)

@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -26,20 +24,8 @@ modules: the :mod:`.tokenizer` and the :mod:`.builder`. This module joins them
 together into one interface.
 """

-class ParserError(Exception):
-    """Exception raised when an internal error occurs while parsing.
-
-    This does not mean that the wikicode was invalid, because invalid markup
-    should still be parsed correctly. This means that the parser caught itself
-    with an impossible internal state and is bailing out before other problems
-    can happen. Its appearance indicates a bug.
-    """
-    def __init__(self, extra):
-        msg = "This is a bug and should be reported. Info: {}.".format(extra)
-        super(ParserError, self).__init__(msg)
-
-
 from .builder import Builder
+from .errors import ParserError
 try:
     from ._tokenizer import CTokenizer
     use_c = True
@@ -50,7 +36,7 @@ except ImportError:

 __all__ = ["use_c", "Parser", "ParserError"]

-class Parser(object):
+class Parser:
     """Represents a parser for wikicode.

     Actual parsing is a two-step process: first, the text is split up into a
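
Note: ParserError now lives in parser/errors.py but is still re-exported here, so existing imports keep working. A small sketch of the intended usage:

    import mwparserfromhell
    from mwparserfromhell.parser import ParserError   # re-exported from .errors

    try:
        code = mwparserfromhell.parse("{{foo|[[bar]]}}")
    except ParserError as exc:
        # Signals an internal parser bug; invalid wikitext alone never raises this.
        print("parser bug:", exc)
    else:
        print(code.filter_wikilinks())   # ['[[bar]]']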


+ 7
- 12
mwparserfromhell/parser/builder.py View File

@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
# #
# Permission is hereby granted, free of charge, to any person obtaining a copy # Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE. # SOFTWARE.


from __future__ import unicode_literals


from . import tokens, ParserError
from ..compat import str
from . import tokens
from .errors import ParserError
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
                     Template, Text, Wikilink)
from ..nodes.extras import Attribute, Parameter
@@ -45,7 +42,7 @@ def _add_handler(token_type):
    return decorator




class Builder(object):
class Builder:
"""Builds a tree of nodes out of a sequence of tokens. """Builds a tree of nodes out of a sequence of tokens.


To use, pass a list of :class:`.Token`\\ s to the :meth:`build` method. The To use, pass a list of :class:`.Token`\\ s to the :meth:`build` method. The
@@ -201,8 +198,7 @@ class Builder(object):
            if isinstance(token, tokens.HeadingEnd):
                title = self._pop()
                return Heading(title, level)
            else:
                self._write(self._handle_token(token))
            self._write(self._handle_token(token))
        raise ParserError("_handle_heading() missed a close token")


    @_add_handler(tokens.CommentStart)
@@ -214,8 +210,7 @@ class Builder(object):
            if isinstance(token, tokens.CommentEnd):
                contents = self._pop()
                return Comment(contents)
            else:
                self._write(self._handle_token(token))
            self._write(self._handle_token(token))
        raise ParserError("_handle_comment() missed a close token")


    def _handle_attribute(self, start):
@@ -286,7 +281,7 @@ class Builder(object):
            return _HANDLERS[type(token)](self, token)
        except KeyError:
            err = "_handle_token() got unexpected {0}"
raise ParserError(err.format(type(token).__name__))
raise ParserError(err.format(type(token).__name__)) from None


    def build(self, tokenlist):
        """Build a Wikicode object from a list tokens and return it."""


+ 0
- 2
mwparserfromhell/parser/contexts.py

@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy


+ 1
- 25
mwparserfromhell/parser/ctokenizer/common.h

@@ -23,7 +23,7 @@ SOFTWARE.
#pragma once

#ifndef PY_SSIZE_T_CLEAN
#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html
#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html
#endif

#include <Python.h>
@@ -34,10 +34,6 @@ SOFTWARE.


/* Compatibility macros */

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

#ifndef uint64_t
#define uint64_t unsigned PY_LONG_LONG
#endif
@@ -48,20 +44,8 @@ SOFTWARE.

/* Unicode support macros */


#if defined(IS_PY3K) && PY_MINOR_VERSION >= 3
#define PEP_393
#endif

#ifdef PEP_393
#define Unicode Py_UCS4
#define PyUnicode_FROM_SINGLE(chr) \
    PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)
#else
#define Unicode Py_UNICODE
#define PyUnicode_FROM_SINGLE(chr) \
    PyUnicode_FromUnicode(&(chr), 1)
#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE
#endif

/* Error handling macros */


@@ -85,13 +69,9 @@ extern PyObject* definitions;
typedef struct {
    Py_ssize_t capacity;
    Py_ssize_t length;
#ifdef PEP_393
    PyObject* object;
    int kind;
    void* data;
#else
    Py_UNICODE* data;
#endif
} Textbuffer;

typedef struct {
@@ -111,12 +91,8 @@ typedef struct Stack Stack;
typedef struct {
    PyObject* object;   /* base PyUnicodeObject object */
    Py_ssize_t length;  /* length of object, in code points */
#ifdef PEP_393
    int kind;           /* object's kind value */
    void* data;         /* object's raw unicode buffer */
#else
    Py_UNICODE* buf;    /* object's internal buffer */
#endif
} TokenizerInput;

typedef struct avl_tree_node avl_tree;


+ 62
- 12
mwparserfromhell/parser/ctokenizer/definitions.c

@@ -1,5 +1,5 @@
/*
Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
@@ -28,29 +28,79 @@
*/


static const char* URI_SCHEMES[] = {
"http", "https", "ftp", "ftps", "ssh", "sftp", "irc", "ircs", "xmpp",
"sip", "sips", "gopher", "telnet", "nntp", "worldwind", "mailto", "tel",
"sms", "news", "svn", "git", "mms", "bitcoin", "magnet", "urn", "geo", NULL
"bitcoin",
"ftp",
"ftps",
"geo",
"git",
"gopher",
"http",
"https",
"irc",
"ircs",
"magnet",
"mailto",
"mms",
"news",
"nntp",
"redis",
"sftp",
"sip",
"sips",
"sms",
"ssh",
"svn",
"tel",
"telnet",
"urn",
"worldwind",
"xmpp",
NULL,
};

static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
"xmpp", "sip", "sips", "mailto", "tel", "sms", "news", "bitcoin", "magnet",
"urn", "geo", NULL
"bitcoin",
"geo",
"magnet",
"mailto",
"news",
"sip",
"sips",
"sms",
"tel",
"urn",
"xmpp",
NULL,
};

static const char* PARSER_BLACKLIST[] = {
"categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
"nowiki", "pre", "score", "section", "source", "syntaxhighlight",
"templatedata", "timeline", NULL
"categorytree",
"ce",
"chem",
"gallery",
"graph",
"hiero",
"imagemap",
"inputbox",
"math",
"nowiki",
"pre",
"score",
"section",
"source",
"syntaxhighlight",
"templatedata",
"timeline",
NULL,
};

static const char* SINGLE[] = {
"br", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr",
"wbr", NULL
"br", "wbr", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td",
"tr", NULL
};

static const char* SINGLE_ONLY[] = {
    "br", "hr", "meta", "link", "img", "wbr", NULL
    "br", "wbr", "hr", "meta", "link", "img", NULL
};

/*
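These C tables mirror the pure-Python definitions module (the tokenizer imports is_scheme from it later in this diff). A hedged sketch of the Python-side lookup; the slashes keyword is an assumption about the helper's signature:

    # Hedged sketch: querying URI scheme support from Python.
    from mwparserfromhell.definitions import is_scheme

    print(is_scheme("https"))                  # ordinary scheme -> True
    print(is_scheme("mailto", slashes=False))  # authority-optional scheme -> True
    print(is_scheme("redis"))                  # added by this change -> True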


+ 1
- 1
mwparserfromhell/parser/ctokenizer/tag_data.h

@@ -32,7 +32,7 @@ typedef struct {
    Textbuffer* pad_first;
    Textbuffer* pad_before_eq;
    Textbuffer* pad_after_eq;
    Unicode quoter;
    Py_UCS4 quoter;
    Py_ssize_t reset;
} TagData;




+ 6
- 49
mwparserfromhell/parser/ctokenizer/textbuffer.c

@@ -29,23 +29,16 @@ SOFTWARE.
/*
    Internal allocation function for textbuffers.
*/
static int internal_alloc(Textbuffer* self, Unicode maxchar)
static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar)
{
    self->capacity = INITIAL_CAPACITY;
    self->length = 0;

#ifdef PEP_393
    self->object = PyUnicode_New(self->capacity, maxchar);
    if (!self->object)
        return -1;
    self->kind = PyUnicode_KIND(self->object);
    self->data = PyUnicode_DATA(self->object);
#else
    (void) maxchar;  // Unused
    self->data = malloc(sizeof(Unicode) * self->capacity);
    if (!self->data)
        return -1;
#endif

    return 0;
}
@@ -55,11 +48,7 @@ static int internal_alloc(Textbuffer* self, Unicode maxchar)
*/
static void internal_dealloc(Textbuffer* self)
{
#ifdef PEP_393
    Py_DECREF(self->object);
#else
    free(self->data);
#endif
}

/*
@@ -67,7 +56,6 @@ static void internal_dealloc(Textbuffer* self)
*/ */
static int internal_resize(Textbuffer* self, Py_ssize_t new_cap) static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
{ {
#ifdef PEP_393
PyObject *newobj; PyObject *newobj;
void *newdata; void *newdata;


@@ -79,10 +67,6 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
Py_DECREF(self->object); Py_DECREF(self->object);
self->object = newobj; self->object = newobj;
self->data = newdata; self->data = newdata;
#else
if (!(self->data = realloc(self->data, sizeof(Unicode) * new_cap)))
return -1;
#endif


self->capacity = new_cap; self->capacity = new_cap;
return 0; return 0;
@@ -94,11 +78,9 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
Textbuffer* Textbuffer_new(TokenizerInput* text)
{
    Textbuffer* self = malloc(sizeof(Textbuffer));
    Unicode maxchar = 0;
    Py_UCS4 maxchar = 0;

#ifdef PEP_393
    maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);
#endif

    if (!self)
        goto fail_nomem;
@@ -127,11 +109,9 @@ void Textbuffer_dealloc(Textbuffer* self)
*/
int Textbuffer_reset(Textbuffer* self)
{
    Unicode maxchar = 0;
    Py_UCS4 maxchar = 0;

#ifdef PEP_393
    maxchar = PyUnicode_MAX_CHAR_VALUE(self->object);
#endif

    internal_dealloc(self);
    if (internal_alloc(self, maxchar))
@@ -142,18 +122,14 @@ int Textbuffer_reset(Textbuffer* self)
/*
    Write a Unicode codepoint to the given textbuffer.
*/
int Textbuffer_write(Textbuffer* self, Unicode code)
int Textbuffer_write(Textbuffer* self, Py_UCS4 code)
{
    if (self->length >= self->capacity) {
        if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0)
            return -1;
    }

#ifdef PEP_393
    PyUnicode_WRITE(self->kind, self->data, self->length++, code);
#else
    self->data[self->length++] = code;
#endif

    return 0;
}
@@ -163,13 +139,9 @@ int Textbuffer_write(Textbuffer* self, Unicode code)


    This function does not check for bounds.
*/
Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index)
Py_UCS4 Textbuffer_read(Textbuffer* self, Py_ssize_t index)
{
#ifdef PEP_393
    return PyUnicode_READ(self->kind, self->data, index);
#else
    return self->data[index];
#endif
}

/*
@@ -177,11 +149,7 @@ Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index)
*/
PyObject* Textbuffer_render(Textbuffer* self)
{
#ifdef PEP_393
    return PyUnicode_FromKindAndData(self->kind, self->data, self->length);
#else
    return PyUnicode_FromUnicode(self->data, self->length);
#endif
}

/*
@@ -196,14 +164,9 @@ int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
        return -1;
    }

#ifdef PEP_393
    assert(self->kind == other->kind);
    memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data,
           other->length * other->kind);
#else
    memcpy(self->data + self->length, other->data,
           other->length * sizeof(Unicode));
#endif

    self->length = newlen;
    return 0;
@@ -215,18 +178,12 @@ int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
void Textbuffer_reverse(Textbuffer* self)
{
    Py_ssize_t i, end = self->length - 1;
    Unicode tmp;
    Py_UCS4 tmp;

    for (i = 0; i < self->length / 2; i++) {
#ifdef PEP_393
        tmp = PyUnicode_READ(self->kind, self->data, i);
        PyUnicode_WRITE(self->kind, self->data, i,
                        PyUnicode_READ(self->kind, self->data, end - i));
        PyUnicode_WRITE(self->kind, self->data, end - i, tmp);
#else
        tmp = self->data[i];
        self->data[i] = self->data[end - i];
        self->data[end - i] = tmp;
#endif
    }
}

+ 2
- 2
mwparserfromhell/parser/ctokenizer/textbuffer.h

@@ -29,8 +29,8 @@ SOFTWARE.
Textbuffer* Textbuffer_new(TokenizerInput*);
void Textbuffer_dealloc(Textbuffer*);
int Textbuffer_reset(Textbuffer*);
int Textbuffer_write(Textbuffer*, Unicode);
Unicode Textbuffer_read(Textbuffer*, Py_ssize_t);
int Textbuffer_write(Textbuffer*, Py_UCS4);
Py_UCS4 Textbuffer_read(Textbuffer*, Py_ssize_t);
PyObject* Textbuffer_render(Textbuffer*);
int Textbuffer_concat(Textbuffer*, Textbuffer*);
void Textbuffer_reverse(Textbuffer*);

+ 26
- 30
mwparserfromhell/parser/ctokenizer/tok_parse.c

@@ -52,7 +52,7 @@ static int Tokenizer_parse_tag(Tokenizer*);
/*
    Determine whether the given code point is a marker.
*/
static int is_marker(Unicode this)
static int is_marker(Py_UCS4 this)
{
    int i;


@@ -442,7 +442,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
static const char* valid = URISCHEME; static const char* valid = URISCHEME;
Textbuffer* buffer; Textbuffer* buffer;
PyObject* scheme; PyObject* scheme;
Unicode this;
Py_UCS4 this;
int slashes, i; int slashes, i;


if (Tokenizer_check_route(self, LC_EXT_LINK_URI) < 0) if (Tokenizer_check_route(self, LC_EXT_LINK_URI) < 0)
@@ -463,7 +463,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
while (1) { while (1) {
if (!valid[i]) if (!valid[i])
goto end_of_loop; goto end_of_loop;
if (this == (Unicode) valid[i])
if (this == (Py_UCS4) valid[i])
break; break;
i++; i++;
} }
@@ -516,7 +516,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
static const char* valid = URISCHEME; static const char* valid = URISCHEME;
Textbuffer *scheme_buffer = Textbuffer_new(&self->text); Textbuffer *scheme_buffer = Textbuffer_new(&self->text);
PyObject *scheme; PyObject *scheme;
Unicode chunk;
Py_UCS4 chunk;
Py_ssize_t i; Py_ssize_t i;
int slashes, j; int slashes, j;
uint64_t new_context; uint64_t new_context;
@@ -536,7 +536,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
FAIL_ROUTE(0); FAIL_ROUTE(0);
return 0; return 0;
} }
} while (chunk != (Unicode) valid[j++]);
} while (chunk != (Py_UCS4) valid[j++]);
Textbuffer_write(scheme_buffer, chunk); Textbuffer_write(scheme_buffer, chunk);
} }
end_of_loop: end_of_loop:
@@ -580,7 +580,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
    Handle text in a free external link, including trailing punctuation.
*/
static int Tokenizer_handle_free_link_text(
    Tokenizer* self, int* parens, Textbuffer* tail, Unicode this)
    Tokenizer* self, int* parens, Textbuffer* tail, Py_UCS4 this)
{
#define PUSH_TAIL_BUFFER(tail, error) \
    if (tail && tail->length > 0) { \
@@ -607,10 +607,10 @@ static int Tokenizer_handle_free_link_text(
    Return whether the current head is the end of a free link.
*/
static int
Tokenizer_is_free_link(Tokenizer* self, Unicode this, Unicode next)
Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
{
    // Built from Tokenizer_parse()'s end sentinels:
    Unicode after = Tokenizer_read(self, 2);
    Py_UCS4 after = Tokenizer_read(self, 2);
    uint64_t ctx = self->topstack->context;

    return (!this || this == '\n' || this == '[' || this == ']' ||
@@ -628,7 +628,7 @@ static PyObject*
Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
Textbuffer* extra) Textbuffer* extra)
{ {
Unicode this, next;
Py_UCS4 this, next;
int parens = 0; int parens = 0;


if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) : if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) :
@@ -816,11 +816,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
    if (!heading) {
        return -1;
    }
#ifdef IS_PY3K
    level = PyLong_FromSsize_t(heading->level);
#else
    level = PyInt_FromSsize_t(heading->level);
#endif
    if (!level) {
        Py_DECREF(heading->title);
        free(heading);
@@ -933,7 +929,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
static int Tokenizer_really_parse_entity(Tokenizer* self) static int Tokenizer_really_parse_entity(Tokenizer* self)
{ {
PyObject *kwargs, *charobj, *textobj; PyObject *kwargs, *charobj, *textobj;
Unicode this;
Py_UCS4 this;
int numeric, hexadecimal, i, j, zeroes, test; int numeric, hexadecimal, i, j, zeroes, test;
char *valid, *text, *buffer, *def; char *valid, *text, *buffer, *def;


@@ -1014,7 +1010,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
while (1) { while (1) {
if (!valid[j]) if (!valid[j])
FAIL_ROUTE_AND_EXIT() FAIL_ROUTE_AND_EXIT()
if (this == (Unicode) valid[j])
if (this == (Py_UCS4) valid[j])
break; break;
j++; j++;
} }
@@ -1111,7 +1107,7 @@ static int Tokenizer_parse_comment(Tokenizer* self)
{ {
Py_ssize_t reset = self->head + 3; Py_ssize_t reset = self->head + 3;
PyObject *comment; PyObject *comment;
Unicode this;
Py_UCS4 this;


self->head += 4; self->head += 4;
if (Tokenizer_push(self, 0)) if (Tokenizer_push(self, 0))
@@ -1211,7 +1207,7 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data)
Handle whitespace inside of an HTML open tag. Handle whitespace inside of an HTML open tag.
*/ */
static int Tokenizer_handle_tag_space( static int Tokenizer_handle_tag_space(
Tokenizer* self, TagData* data, Unicode text)
Tokenizer* self, TagData* data, Py_UCS4 text)
{ {
uint64_t ctx = data->context; uint64_t ctx = data->context;
uint64_t end_of_value = (ctx & TAG_ATTR_VALUE && uint64_t end_of_value = (ctx & TAG_ATTR_VALUE &&
@@ -1243,9 +1239,9 @@ static int Tokenizer_handle_tag_space(
/*
    Handle regular text inside of an HTML open tag.
*/
static int Tokenizer_handle_tag_text(Tokenizer* self, Unicode text)
static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UCS4 text)
{
    Unicode next = Tokenizer_read(self, 1);
    Py_UCS4 next = Tokenizer_read(self, 1);

    if (!is_marker(text) || !Tokenizer_CAN_RECURSE(self))
        return Tokenizer_emit_char(self, text);
@@ -1262,7 +1258,7 @@ static int Tokenizer_handle_tag_text(Tokenizer* self, Unicode text)
Handle all sorts of text data inside of an HTML open tag. Handle all sorts of text data inside of an HTML open tag.
*/ */
static int Tokenizer_handle_tag_data( static int Tokenizer_handle_tag_data(
Tokenizer* self, TagData* data, Unicode chunk)
Tokenizer* self, TagData* data, Py_UCS4 chunk)
{ {
PyObject *trash; PyObject *trash;
int first_time, escaped; int first_time, escaped;
@@ -1444,7 +1440,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
{ {
Textbuffer* buffer; Textbuffer* buffer;
PyObject *buf_tmp, *end_tag, *start_tag; PyObject *buf_tmp, *end_tag, *start_tag;
Unicode this, next;
Py_UCS4 this, next;
Py_ssize_t reset; Py_ssize_t reset;
int cmp; int cmp;


@@ -1600,7 +1596,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
{ {
TagData *data = TagData_new(&self->text); TagData *data = TagData_new(&self->text);
PyObject *token, *text, *trash; PyObject *token, *text, *trash;
Unicode this, next;
Py_UCS4 this, next;
int can_exit; int can_exit;


if (!data) if (!data)
@@ -1686,7 +1682,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
Py_ssize_t reset = self->head + 1, pos = 0; Py_ssize_t reset = self->head + 1, pos = 0;
Textbuffer* buf; Textbuffer* buf;
PyObject *name, *tag; PyObject *name, *tag;
Unicode this;
Py_UCS4 this;


self->head += 2; self->head += 2;
buf = Textbuffer_new(&self->text); buf = Textbuffer_new(&self->text);
@@ -1988,7 +1984,7 @@ static PyObject* Tokenizer_parse_style(Tokenizer* self)
static int Tokenizer_handle_list_marker(Tokenizer* self) static int Tokenizer_handle_list_marker(Tokenizer* self)
{ {
PyObject *kwargs, *markup; PyObject *kwargs, *markup;
Unicode code = Tokenizer_read(self, 0);
Py_UCS4 code = Tokenizer_read(self, 0);


if (code == ';') if (code == ';')
self->topstack->context |= LC_DLTERM; self->topstack->context |= LC_DLTERM;
@@ -2015,7 +2011,7 @@ static int Tokenizer_handle_list_marker(Tokenizer* self)
*/ */
static int Tokenizer_handle_list(Tokenizer* self) static int Tokenizer_handle_list(Tokenizer* self)
{ {
Unicode marker = Tokenizer_read(self, 1);
Py_UCS4 marker = Tokenizer_read(self, 1);


if (Tokenizer_handle_list_marker(self)) if (Tokenizer_handle_list_marker(self))
return -1; return -1;
@@ -2169,11 +2165,11 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
/* /*
Handle style attributes for a table until an ending token. Handle style attributes for a table until an ending token.
*/ */
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token)
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Py_UCS4 end_token)
{ {
TagData *data = TagData_new(&self->text); TagData *data = TagData_new(&self->text);
PyObject *padding, *trash; PyObject *padding, *trash;
Unicode this;
Py_UCS4 this;
int can_exit; int can_exit;


if (!data) if (!data)
@@ -2483,7 +2479,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
everything is safe, or -1 if the route must be failed. everything is safe, or -1 if the route must be failed.
*/ */
static int static int
Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Unicode data)
Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UCS4 data)
{ {
if (context & LC_FAIL_NEXT) if (context & LC_FAIL_NEXT)
return -1; return -1;
@@ -2568,7 +2564,7 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Unicode data)
static int Tokenizer_has_leading_whitespace(Tokenizer* self) static int Tokenizer_has_leading_whitespace(Tokenizer* self)
{ {
int offset = 1; int offset = 1;
Unicode current_character;
Py_UCS4 current_character;
while (1) { while (1) {
current_character = Tokenizer_read_backwards(self, offset); current_character = Tokenizer_read_backwards(self, offset);
if (!current_character || current_character == '\n') if (!current_character || current_character == '\n')
@@ -2586,7 +2582,7 @@ static int Tokenizer_has_leading_whitespace(Tokenizer* self)
PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
{
    uint64_t this_context;
    Unicode this, next, next_next, last;
    Py_UCS4 this, next, next_next, last;
    PyObject* temp;

    if (push) {


+ 1
- 1
mwparserfromhell/parser/ctokenizer/tok_parse.h

@@ -24,7 +24,7 @@ SOFTWARE.


#include "common.h" #include "common.h"


static const Unicode MARKERS[] = {
static const Py_UCS4 MARKERS[] = {
    '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
    '-', '!', '\n', '\0'};




+ 4
- 8
mwparserfromhell/parser/ctokenizer/tok_support.c

@@ -275,7 +275,7 @@ int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
/*
    Write a Unicode codepoint to the current textbuffer.
*/
int Tokenizer_emit_char(Tokenizer* self, Unicode code)
int Tokenizer_emit_char(Tokenizer* self, Py_UCS4 code)
{
    return Textbuffer_write(self->topstack->textbuffer, code);
}
@@ -389,19 +389,15 @@ int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
/*
    Internal function to read the codepoint at the given index from the input.
*/
static Unicode read_codepoint(TokenizerInput* text, Py_ssize_t index)
static Py_UCS4 read_codepoint(TokenizerInput* text, Py_ssize_t index)
{
#ifdef PEP_393
    return PyUnicode_READ(text->kind, text->data, index);
#else
    return text->buf[index];
#endif
}


/*
    Read the value at a relative point in the wikicode, forwards.
*/
Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
Py_UCS4 Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
{
    Py_ssize_t index = self->head + delta;


@@ -413,7 +409,7 @@ Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
/*
    Read the value at a relative point in the wikicode, backwards.
*/
Unicode Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
Py_UCS4 Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
{
    Py_ssize_t index;




+ 3
- 3
mwparserfromhell/parser/ctokenizer/tok_support.h

@@ -38,14 +38,14 @@ void Tokenizer_free_bad_route_tree(Tokenizer*);


int Tokenizer_emit_token(Tokenizer*, PyObject*, int);
int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int);
int Tokenizer_emit_char(Tokenizer*, Unicode);
int Tokenizer_emit_char(Tokenizer*, Py_UCS4);
int Tokenizer_emit_text(Tokenizer*, const char*);
int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*);
int Tokenizer_emit_all(Tokenizer*, PyObject*);
int Tokenizer_emit_text_then_stack(Tokenizer*, const char*);


Unicode Tokenizer_read(Tokenizer*, Py_ssize_t);
Unicode Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);
Py_UCS4 Tokenizer_read(Tokenizer*, Py_ssize_t);
Py_UCS4 Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);


/* Macros */




+ 9
- 25
mwparserfromhell/parser/ctokenizer/tokenizer.c

@@ -85,12 +85,8 @@ static void init_tokenizer_text(TokenizerInput* text)
    text->object = Py_None;
    Py_INCREF(Py_None);
    text->length = 0;
#ifdef PEP_393
    text->kind = PyUnicode_WCHAR_KIND;
    text->data = NULL;
#else
    text->buf = NULL;
#endif
}

/*
@@ -119,14 +115,10 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
    dealloc_tokenizer_text(text);
    text->object = input;

#ifdef PEP_393
    if (PyUnicode_READY(input) < 0)
        return -1;
    text->kind = PyUnicode_KIND(input);
    text->data = PyUnicode_DATA(input);
#else
    text->buf = PyUnicode_AS_UNICODE(input);
#endif
    text->length = PyUnicode_GET_LENGTH(input);
    return 0;
}
@@ -192,11 +184,9 @@ static int load_entities(void)
{
    PyObject *tempmod, *defmap, *deflist;
    unsigned numdefs, i;
#ifdef IS_PY3K
    PyObject *string;
#endif

    tempmod = PyImport_ImportModule(ENTITYDEFS_MODULE);
    tempmod = PyImport_ImportModule("html.entities");
    if (!tempmod)
        return -1;
    defmap = PyObject_GetAttrString(tempmod, "entitydefs");
@@ -212,14 +202,10 @@ static int load_entities(void)
    if (!entitydefs)
        return -1;
    for (i = 0; i < numdefs; i++) {
#ifdef IS_PY3K
        string = PyUnicode_AsASCIIString(PyList_GET_ITEM(deflist, i));
        if (!string)
            return -1;
        entitydefs[i] = PyBytes_AsString(string);
#else
        entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i));
#endif
        if (!entitydefs[i])
            return -1;
    }
@@ -233,7 +219,7 @@ static int load_tokens(void)
             *globals = PyEval_GetGlobals(),
             *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1),
             *modname = IMPORT_NAME_FUNC("tokens");
             *modname = PyUnicode_FromString("tokens");
    char *name = "mwparserfromhell.parser";

    if (!fromlist || !modname)
@@ -256,7 +242,7 @@ static int load_defs(void)
*globals = PyEval_GetGlobals(), *globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(), *locals = PyEval_GetLocals(),
*fromlist = PyList_New(1), *fromlist = PyList_New(1),
*modname = IMPORT_NAME_FUNC("definitions");
*modname = PyUnicode_FromString("definitions");
char *name = "mwparserfromhell"; char *name = "mwparserfromhell";


if (!fromlist || !modname) if (!fromlist || !modname)
@@ -277,7 +263,7 @@ static int load_exceptions(void)
*globals = PyEval_GetGlobals(), *globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(), *locals = PyEval_GetLocals(),
*fromlist = PyList_New(1), *fromlist = PyList_New(1),
*modname = IMPORT_NAME_FUNC("parser");
*modname = PyUnicode_FromString("parser");
char *name = "mwparserfromhell"; char *name = "mwparserfromhell";


if (!fromlist || !modname) if (!fromlist || !modname)
@@ -294,24 +280,22 @@ static int load_exceptions(void)
    return 0;
}

PyMODINIT_FUNC INIT_FUNC_NAME(void)
PyMODINIT_FUNC PyInit__tokenizer(void)
{
    PyObject *module;

    TokenizerType.tp_new = PyType_GenericNew;
    if (PyType_Ready(&TokenizerType) < 0)
        INIT_ERROR;
    module = CREATE_MODULE;
        return NULL;
    module = PyModule_Create(&module_def);
    if (!module)
        INIT_ERROR;
        return NULL;
    Py_INCREF(&TokenizerType);
    PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
    Py_INCREF(Py_True);
    PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
    NOARGS = PyTuple_New(0);
    if (!NOARGS || load_entities() || load_tokens() || load_defs())
        INIT_ERROR;
#ifdef IS_PY3K
        return NULL;
    return module;
#endif
}

+ 0
- 18
mwparserfromhell/parser/ctokenizer/tokenizer.h

@@ -32,22 +32,6 @@ static void Tokenizer_dealloc(Tokenizer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);


/* Compatibility macros */

#ifdef IS_PY3K
#define IMPORT_NAME_FUNC PyUnicode_FromString
#define CREATE_MODULE PyModule_Create(&module_def);
#define ENTITYDEFS_MODULE "html.entities"
#define INIT_FUNC_NAME PyInit__tokenizer
#define INIT_ERROR return NULL
#else
#define IMPORT_NAME_FUNC PyBytes_FromString
#define CREATE_MODULE Py_InitModule("_tokenizer", NULL);
#define ENTITYDEFS_MODULE "htmlentitydefs"
#define INIT_FUNC_NAME init_tokenizer
#define INIT_ERROR return
#endif

/* Structs */

static PyMethodDef Tokenizer_methods[] = {
@@ -101,11 +85,9 @@ static PyTypeObject TokenizerType = {
    Tokenizer_new, /* tp_new */
};


#ifdef IS_PY3K
static PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,
    "_tokenizer",
    "Creates a list of tokens from a string of wikicode.",
    -1, NULL, NULL, NULL, NULL, NULL
};
#endif

+ 33
- 0
mwparserfromhell/parser/errors.py

@@ -0,0 +1,33 @@
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

__all__ = ["ParserError"]

class ParserError(Exception):
"""Exception raised when an internal error occurs while parsing.

This does not mean that the wikicode was invalid, because invalid markup
should still be parsed correctly. This means that the parser caught itself
with an impossible internal state and is bailing out before other problems
can happen. Its appearance indicates a bug.
"""
def __init__(self, extra):
msg = "This is a bug and should be reported. Info: {}.".format(extra)
super().__init__(msg)
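A hedged sketch of how callers are expected to treat the relocated exception: hitting ParserError signals a bug in the parser itself, not invalid wikitext (mwparserfromhell.parse is the library's usual entry point):

    # Hedged sketch: ParserError only fires on internal parser bugs.
    import mwparserfromhell
    from mwparserfromhell.parser import ParserError

    try:
        wikicode = mwparserfromhell.parse("{{unbalanced|markup")
    except ParserError as exc:
        print("parser bug, please report:", exc)  # bad markup alone never raises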

+ 79
- 80
mwparserfromhell/parser/tokenizer.py

@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -20,12 +18,12 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from __future__ import unicode_literals
import html.entities as htmlentities
from math import log
import re

from . import contexts, tokens, ParserError
from ..compat import htmlentities, range
from . import contexts, tokens
from .errors import ParserError
from ..definitions import (get_html_tag, is_parsable, is_single,
                           is_single_only, is_scheme)


@@ -35,11 +33,11 @@ class BadRoute(Exception):
"""Raised internally when the current tokenization route is invalid.""" """Raised internally when the current tokenization route is invalid."""


def __init__(self, context=0): def __init__(self, context=0):
super(BadRoute, self).__init__()
super().__init__()
self.context = context self.context = context




class _TagOpenData(object):
class _TagOpenData:
"""Stores data about an HTML open tag, like ``<ref name="foo">``.""" """Stores data about an HTML open tag, like ``<ref name="foo">``."""
CX_NAME = 1 << 0 CX_NAME = 1 << 0
CX_ATTR_READY = 1 << 1 CX_ATTR_READY = 1 << 1
@@ -57,7 +55,7 @@ class _TagOpenData(object):
self.reset = 0 self.reset = 0




class Tokenizer(object):
class Tokenizer:
"""Creates a list of tokens from a string of wikicode.""" """Creates a list of tokens from a string of wikicode."""
USES_C = False USES_C = False
START = object() START = object()
@@ -325,7 +323,7 @@ class Tokenizer(object):
        self._head += 2
        try:
            # If the wikilink looks like an external link, parse it as such:
            link, extra, delta = self._really_parse_external_link(True)
            link, _extra, _delta = self._really_parse_external_link(True)
        except BadRoute:
            self._head = reset + 1
            try:
@@ -435,17 +433,17 @@ class Tokenizer(object):
            self._emit_text(this)
        return punct, tail

    def _is_free_link_end(self, this, next):
    def _is_free_link_end(self, this, nxt):
        """Return whether the current head is the end of a free link."""
        # Built from _parse()'s end sentinels:
        after, ctx = self._read(2), self._context
        equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING
        return (this in (self.END, "\n", "[", "]", "<", ">") or
                this == next == "'" or
                this == nxt == "'" or
                (this == "|" and ctx & contexts.TEMPLATE) or
                (this == "=" and ctx & equal_sign_contexts) or
                (this == next == "}" and ctx & contexts.TEMPLATE) or
                (this == next == after == "}" and ctx & contexts.ARGUMENT))
                (this == nxt == "}" and ctx & contexts.TEMPLATE) or
                (this == nxt == after == "}" and ctx & contexts.ARGUMENT))


def _really_parse_external_link(self, brackets): def _really_parse_external_link(self, brackets):
"""Really parse an external link.""" """Really parse an external link."""
@@ -460,23 +458,23 @@ class Tokenizer(object):
self._fail_route() self._fail_route()
tail = "" tail = ""
while True: while True:
this, next = self._read(), self._read(1)
this, nxt = self._read(), self._read(1)
if this == "&": if this == "&":
if tail: if tail:
self._emit_text(tail) self._emit_text(tail)
tail = "" tail = ""
self._parse_entity() self._parse_entity()
elif (this == "<" and next == "!" and self._read(2) ==
elif (this == "<" and nxt == "!" and self._read(2) ==
self._read(3) == "-"): self._read(3) == "-"):
if tail: if tail:
self._emit_text(tail) self._emit_text(tail)
tail = "" tail = ""
self._parse_comment() self._parse_comment()
elif not brackets and self._is_free_link_end(this, next):
elif not brackets and self._is_free_link_end(this, nxt):
return self._pop(), tail, -1 return self._pop(), tail, -1
elif this is self.END or this == "\n": elif this is self.END or this == "\n":
self._fail_route() self._fail_route()
elif this == next == "{" and self._can_recurse():
elif this == nxt == "{" and self._can_recurse():
if tail: if tail:
self._emit_text(tail) self._emit_text(tail)
tail = "" tail = ""
@@ -704,12 +702,12 @@ class Tokenizer(object):


def _handle_tag_text(self, text): def _handle_tag_text(self, text):
"""Handle regular *text* inside of an HTML open tag.""" """Handle regular *text* inside of an HTML open tag."""
next = self._read(1)
nxt = self._read(1)
if not self._can_recurse() or text not in self.MARKERS: if not self._can_recurse() or text not in self.MARKERS:
self._emit_text(text) self._emit_text(text)
elif text == next == "{":
elif text == nxt == "{":
self._parse_template_or_argument() self._parse_template_or_argument()
elif text == next == "[":
elif text == nxt == "[":
self._parse_wikilink() self._parse_wikilink()
elif text == "<": elif text == "<":
self._parse_tag() self._parse_tag()
@@ -798,10 +796,10 @@ class Tokenizer(object):
"""Handle the body of an HTML tag that is parser-blacklisted.""" """Handle the body of an HTML tag that is parser-blacklisted."""
strip = lambda text: text.rstrip().lower() strip = lambda text: text.rstrip().lower()
while True: while True:
this, next = self._read(), self._read(1)
this, nxt = self._read(), self._read(1)
if this is self.END: if this is self.END:
self._fail_route() self._fail_route()
elif this == "<" and next == "/":
elif this == "<" and nxt == "/":
self._head += 3 self._head += 3
if self._read() != ">" or (strip(self._read(-1)) != if self._read() != ">" or (strip(self._read(-1)) !=
strip(self._stack[1].text)): strip(self._stack[1].text)):
@@ -856,7 +854,7 @@ class Tokenizer(object):
self._push(contexts.TAG_OPEN) self._push(contexts.TAG_OPEN)
self._emit(tokens.TagOpenOpen()) self._emit(tokens.TagOpenOpen())
while True: while True:
this, next = self._read(), self._read(1)
this, nxt = self._read(), self._read(1)
can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or
data.context & data.CX_NOTE_SPACE) data.context & data.CX_NOTE_SPACE)
if this is self.END: if this is self.END:
@@ -878,7 +876,7 @@ class Tokenizer(object):
if is_parsable(self._stack[1].text): if is_parsable(self._stack[1].text):
return self._parse(push=False) return self._parse(push=False)
return self._handle_blacklisted_tag() return self._handle_blacklisted_tag()
elif this == "/" and next == ">" and can_exit:
elif this == "/" and nxt == ">" and can_exit:
self._handle_tag_close_open(data, tokens.TagCloseSelfclose) self._handle_tag_close_open(data, tokens.TagCloseSelfclose)
return self._pop() return self._pop()
else: else:
@@ -935,9 +933,11 @@ class Tokenizer(object):
stack = self._parse(new_ctx) stack = self._parse(new_ctx)
except BadRoute: except BadRoute:
self._head = reset self._head = reset
return self._emit_text("''")
self._emit_text("''")
return
else: else:
return self._emit_text("''")
self._emit_text("''")
return
self._emit_style_tag("i", "''", stack) self._emit_style_tag("i", "''", stack)


def _parse_bold(self): def _parse_bold(self):
@@ -950,7 +950,7 @@ class Tokenizer(object):
if self._context & contexts.STYLE_SECOND_PASS: if self._context & contexts.STYLE_SECOND_PASS:
self._emit_text("'") self._emit_text("'")
return True return True
elif self._context & contexts.STYLE_ITALICS:
if self._context & contexts.STYLE_ITALICS:
self._context |= contexts.STYLE_PASS_AGAIN self._context |= contexts.STYLE_PASS_AGAIN
self._emit_text("'''") self._emit_text("'''")
else: else:
@@ -958,6 +958,7 @@ class Tokenizer(object):
self._parse_italics() self._parse_italics()
else: else:
self._emit_style_tag("b", "'''", stack) self._emit_style_tag("b", "'''", stack)
return False


def _parse_italics_and_bold(self): def _parse_italics_and_bold(self):
"""Parse wiki-style italics and bold together (i.e., five ticks).""" """Parse wiki-style italics and bold together (i.e., five ticks)."""
@@ -1019,7 +1020,7 @@ class Tokenizer(object):
if ticks == 5: if ticks == 5:
self._head -= 3 if italics else 2 self._head -= 3 if italics else 2
return self._pop() return self._pop()
elif not self._can_recurse():
if not self._can_recurse():
if ticks == 3: if ticks == 3:
if self._context & contexts.STYLE_SECOND_PASS: if self._context & contexts.STYLE_SECOND_PASS:
self._emit_text("'") self._emit_text("'")
@@ -1103,7 +1104,7 @@ class Tokenizer(object):
if this.isspace(): if this.isspace():
data.padding_buffer["first"] += this data.padding_buffer["first"] += this
return data.padding_buffer["first"] return data.padding_buffer["first"]
elif this is self.END or this == end_token:
if this is self.END or this == end_token:
if self._context & contexts.TAG_ATTR: if self._context & contexts.TAG_ATTR:
if data.context & data.CX_QUOTED: if data.context & data.CX_QUOTED:
# Unclosed attribute quote: reset, don't die # Unclosed attribute quote: reset, don't die
@@ -1243,9 +1244,9 @@ class Tokenizer(object):
if context & contexts.FAIL_NEXT: if context & contexts.FAIL_NEXT:
return False return False
if context & contexts.WIKILINK_TITLE: if context & contexts.WIKILINK_TITLE:
if this == "]" or this == "{":
if this in ("]", "{"):
self._context |= contexts.FAIL_NEXT self._context |= contexts.FAIL_NEXT
elif this == "\n" or this == "[" or this == "}" or this == ">":
elif this in ("\n", "[", "}", ">"):
return False return False
elif this == "<": elif this == "<":
if self._read(1) == "!": if self._read(1) == "!":
@@ -1253,16 +1254,16 @@ class Tokenizer(object):
else: else:
return False return False
return True return True
elif context & contexts.EXT_LINK_TITLE:
if context & contexts.EXT_LINK_TITLE:
return this != "\n" return this != "\n"
elif context & contexts.TEMPLATE_NAME:
if context & contexts.TEMPLATE_NAME:
if this == "{": if this == "{":
self._context |= contexts.HAS_TEMPLATE | contexts.FAIL_NEXT self._context |= contexts.HAS_TEMPLATE | contexts.FAIL_NEXT
return True return True
if this == "}" or (this == "<" and self._read(1) == "!"): if this == "}" or (this == "<" and self._read(1) == "!"):
self._context |= contexts.FAIL_NEXT self._context |= contexts.FAIL_NEXT
return True return True
if this == "[" or this == "]" or this == "<" or this == ">":
if this in ("[", "]", "<", ">"):
return False return False
if this == "|": if this == "|":
return True return True
@@ -1275,30 +1276,29 @@ class Tokenizer(object):
elif this is self.END or not this.isspace(): elif this is self.END or not this.isspace():
self._context |= contexts.HAS_TEXT self._context |= contexts.HAS_TEXT
return True return True
elif context & contexts.TAG_CLOSE:
if context & contexts.TAG_CLOSE:
return this != "<" return this != "<"
else:
if context & contexts.FAIL_ON_EQUALS:
if this == "=":
return False
elif context & contexts.FAIL_ON_LBRACE:
if this == "{" or (self._read(-1) == self._read(-2) == "{"):
if context & contexts.TEMPLATE:
self._context |= contexts.FAIL_ON_EQUALS
else:
self._context |= contexts.FAIL_NEXT
return True
self._context ^= contexts.FAIL_ON_LBRACE
elif context & contexts.FAIL_ON_RBRACE:
if this == "}":
if context & contexts.FAIL_ON_EQUALS:
if this == "=":
return False
elif context & contexts.FAIL_ON_LBRACE:
if this == "{" or (self._read(-1) == self._read(-2) == "{"):
if context & contexts.TEMPLATE:
self._context |= contexts.FAIL_ON_EQUALS
else:
self._context |= contexts.FAIL_NEXT self._context |= contexts.FAIL_NEXT
return True
self._context ^= contexts.FAIL_ON_RBRACE
elif this == "{":
self._context |= contexts.FAIL_ON_LBRACE
elif this == "}":
self._context |= contexts.FAIL_ON_RBRACE
return True
return True
self._context ^= contexts.FAIL_ON_LBRACE
elif context & contexts.FAIL_ON_RBRACE:
if this == "}":
self._context |= contexts.FAIL_NEXT
return True
self._context ^= contexts.FAIL_ON_RBRACE
elif this == "{":
self._context |= contexts.FAIL_ON_LBRACE
elif this == "}":
self._context |= contexts.FAIL_ON_RBRACE
return True


def _parse(self, context=0, push=True): def _parse(self, context=0, push=True):
"""Parse the wikicode string, using *context* for when to stop.""" """Parse the wikicode string, using *context* for when to stop."""
@@ -1317,8 +1317,8 @@ class Tokenizer(object):
continue continue
if this is self.END: if this is self.END:
return self._handle_end() return self._handle_end()
next = self._read(1)
if this == next == "{":
nxt = self._read(1)
if this == nxt == "{":
if self._can_recurse(): if self._can_recurse():
self._parse_template_or_argument() self._parse_template_or_argument()
else: else:
@@ -1326,27 +1326,26 @@ class Tokenizer(object):
elif this == "|" and self._context & contexts.TEMPLATE: elif this == "|" and self._context & contexts.TEMPLATE:
self._handle_template_param() self._handle_template_param()
elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY: elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
if not self._global & contexts.GL_HEADING and self._read(-1) in ("\n", self.START) and next == "=":
if not self._global & contexts.GL_HEADING and self._read(-1) in ("\n", self.START) and nxt == "=":
self._parse_heading() self._parse_heading()
else: else:
self._handle_template_param_value() self._handle_template_param_value()
elif this == next == "}" and self._context & contexts.TEMPLATE:
elif this == nxt == "}" and self._context & contexts.TEMPLATE:
return self._handle_template_end() return self._handle_template_end()
elif this == "|" and self._context & contexts.ARGUMENT_NAME: elif this == "|" and self._context & contexts.ARGUMENT_NAME:
self._handle_argument_separator() self._handle_argument_separator()
elif this == next == "}" and self._context & contexts.ARGUMENT:
elif this == nxt == "}" and self._context & contexts.ARGUMENT:
if self._read(2) == "}": if self._read(2) == "}":
return self._handle_argument_end() return self._handle_argument_end()
else:
self._emit_text("}")
elif this == next == "[" and self._can_recurse():
self._emit_text("}")
elif this == nxt == "[" and self._can_recurse():
if not self._context & contexts.NO_WIKILINKS: if not self._context & contexts.NO_WIKILINKS:
self._parse_wikilink() self._parse_wikilink()
else: else:
self._emit_text("[") self._emit_text("[")
elif this == "|" and self._context & contexts.WIKILINK_TITLE: elif this == "|" and self._context & contexts.WIKILINK_TITLE:
self._handle_wikilink_separator() self._handle_wikilink_separator()
elif this == next == "]" and self._context & contexts.WIKILINK:
elif this == nxt == "]" and self._context & contexts.WIKILINK:
return self._handle_wikilink_end() return self._handle_wikilink_end()
elif this == "[": elif this == "[":
self._parse_external_link(True) self._parse_external_link(True)
@@ -1365,12 +1364,12 @@ class Tokenizer(object):
self._fail_route() self._fail_route()
elif this == "&": elif this == "&":
self._parse_entity() self._parse_entity()
elif this == "<" and next == "!":
elif this == "<" and nxt == "!":
if self._read(2) == self._read(3) == "-": if self._read(2) == self._read(3) == "-":
self._parse_comment() self._parse_comment()
else: else:
self._emit_text(this) self._emit_text(this)
elif this == "<" and next == "/" and self._read(2) is not self.END:
elif this == "<" and nxt == "/" and self._read(2) is not self.END:
if self._context & contexts.TAG_BODY: if self._context & contexts.TAG_BODY:
self._handle_tag_open_close() self._handle_tag_open_close()
else: else:
@@ -1382,14 +1381,14 @@ class Tokenizer(object):
self._emit_text("<") self._emit_text("<")
elif this == ">" and self._context & contexts.TAG_CLOSE: elif this == ">" and self._context & contexts.TAG_CLOSE:
return self._handle_tag_close_close() return self._handle_tag_close_close()
elif this == next == "'" and not self._skip_style_tags:
elif this == nxt == "'" and not self._skip_style_tags:
result = self._parse_style() result = self._parse_style()
if result is not None: if result is not None:
return result return result
elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"): elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"):
self._handle_list() self._handle_list()
elif self._read(-1) in ("\n", self.START) and ( elif self._read(-1) in ("\n", self.START) and (
this == next == self._read(2) == self._read(3) == "-"):
this == nxt == self._read(2) == self._read(3) == "-"):
self._handle_hr() self._handle_hr()
elif this in ("\n", ":") and self._context & contexts.DL_TERM: elif this in ("\n", ":") and self._context & contexts.DL_TERM:
self._handle_dl_term() self._handle_dl_term()
@@ -1397,7 +1396,7 @@ class Tokenizer(object):
# Kill potential table contexts # Kill potential table contexts
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
# Start of table parsing # Start of table parsing
elif this == "{" and next == "|" and (
elif this == "{" and nxt == "|" and (
self._read(-1) in ("\n", self.START) or self._read(-1) in ("\n", self.START) or
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
if self._can_recurse(): if self._can_recurse():
@@ -1405,15 +1404,15 @@ class Tokenizer(object):
else: else:
self._emit_text("{") self._emit_text("{")
elif self._context & contexts.TABLE_OPEN: elif self._context & contexts.TABLE_OPEN:
if this == next == "|" and self._context & contexts.TABLE_TD_LINE:
if this == nxt == "|" and self._context & contexts.TABLE_TD_LINE:
if self._context & contexts.TABLE_CELL_OPEN: if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end() return self._handle_table_cell_end()
self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE) self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE)
elif this == next == "|" and self._context & contexts.TABLE_TH_LINE:
elif this == nxt == "|" and self._context & contexts.TABLE_TH_LINE:
if self._context & contexts.TABLE_CELL_OPEN: if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end() return self._handle_table_cell_end()
self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE) self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE)
elif this == next == "!" and self._context & contexts.TABLE_TH_LINE:
elif this == nxt == "!" and self._context & contexts.TABLE_TH_LINE:
if self._context & contexts.TABLE_CELL_OPEN: if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end() return self._handle_table_cell_end()
self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE) self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE)
@@ -1425,13 +1424,13 @@ class Tokenizer(object):
self._emit_text(this) self._emit_text(this)
elif (self._read(-1) in ("\n", self.START) or elif (self._read(-1) in ("\n", self.START) or
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
if this == "|" and next == "}":
if this == "|" and nxt == "}":
if self._context & contexts.TABLE_CELL_OPEN: if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end() return self._handle_table_cell_end()
if self._context & contexts.TABLE_ROW_OPEN: if self._context & contexts.TABLE_ROW_OPEN:
return self._handle_table_row_end() return self._handle_table_row_end()
return self._handle_table_end() return self._handle_table_end()
elif this == "|" and next == "-":
if this == "|" and nxt == "-":
if self._context & contexts.TABLE_CELL_OPEN: if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end() return self._handle_table_cell_end()
if self._context & contexts.TABLE_ROW_OPEN: if self._context & contexts.TABLE_ROW_OPEN:
@@ -1463,10 +1462,10 @@ class Tokenizer(object):
        self._skip_style_tags = skip_style_tags

        try:
            tokens = self._parse(context)
        except BadRoute:  # pragma: no cover (untestable/exceptional case)
            raise ParserError("Python tokenizer exited with BadRoute")
            result = self._parse(context)
        except BadRoute as exc:  # pragma: no cover (untestable/exceptional case)
            raise ParserError("Python tokenizer exited with BadRoute") from exc
        if self._stacks:  # pragma: no cover (untestable/exceptional case)
            err = "Python tokenizer exited with non-empty token stack"
            raise ParserError(err)
return tokens
return result
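For completeness, a hedged sketch of the pure-Python tokenizer entry point edited just above; defaults for context and skip_style_tags are assumed:

    # Hedged sketch: tokenizing without the C extension.
    from mwparserfromhell.parser.tokenizer import Tokenizer

    toks = Tokenizer().tokenize("''hello'' [[world]]")
    print(toks)  # a list of Token objects, later fed to Builder.build()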

+ 2
- 8
mwparserfromhell/parser/tokens.py

@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -28,10 +26,6 @@ a syntactically valid form by the :class:`.Tokenizer`, and then converted into
the :class`.Wikicode` tree by the :class:`.Builder`.
"""


from __future__ import unicode_literals

from ..compat import py3k, str

__all__ = ["Token"] __all__ = ["Token"]


class Token(dict): class Token(dict):
@@ -65,7 +59,7 @@ class Token(dict):
def make(name): def make(name):
"""Create a new Token class using ``type()`` and add it to ``__all__``.""" """Create a new Token class using ``type()`` and add it to ``__all__``."""
__all__.append(name) __all__.append(name)
return type(name if py3k else name.encode("utf8"), (Token,), {})
return type(name, (Token,), {})


Text = make("Text") Text = make("Text")




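As an aside, the simplified ``make()`` above relies on ``type(name, bases, namespace)`` to build each token class at import time; a standalone, illustrative sketch (not part of the diff):

class Token(dict):
    """Stand-in for mwparserfromhell.parser.tokens.Token."""

def make(name):
    # type() creates a new class object from a name, a bases tuple,
    # and an (empty) attribute namespace.
    return type(name, (Token,), {})

Text = make("Text")
assert issubclass(Text, Token) and Text.__name__ == "Text"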
+ 0
- 456
mwparserfromhell/smart_list.py

@@ -1,456 +0,0 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
This module contains the :class:`.SmartList` type, as well as its
:class:`._ListProxy` child, which together implement a list whose sublists
reflect changes made to the main list, and vice-versa.
"""

from __future__ import unicode_literals
from sys import maxsize
from weakref import ref

from .compat import py3k

__all__ = ["SmartList"]

def inheritdoc(method):
"""Set __doc__ of *method* to __doc__ of *method* in its parent class.

Since this is used on :class:`.SmartList`, the "parent class" used is
``list``. This function can be used as a decorator.
"""
method.__doc__ = getattr(list, method.__name__).__doc__
return method


class _SliceNormalizerMixIn(object):
"""MixIn that provides a private method to normalize slices."""

def _normalize_slice(self, key, clamp=False):
"""Return a slice equivalent to the input *key*, standardized."""
if key.start is None:
start = 0
else:
start = (len(self) + key.start) if key.start < 0 else key.start
if key.stop is None or key.stop == maxsize:
stop = len(self) if clamp else None
else:
stop = (len(self) + key.stop) if key.stop < 0 else key.stop
return slice(start, stop, key.step or 1)


class SmartList(_SliceNormalizerMixIn, list):
"""Implements the ``list`` interface with special handling of sublists.

When a sublist is created (by ``list[i:j]``), any changes made to this
list (such as the addition, removal, or replacement of elements) will be
reflected in the sublist, or vice-versa, to the greatest degree possible.
This is implemented by having sublists - instances of the
:class:`._ListProxy` type - dynamically determine their elements by storing
their slice info and retrieving that slice from the parent. Methods that
change the size of the list also change the slice info. For example::

>>> parent = SmartList([0, 1, 2, 3])
>>> parent
[0, 1, 2, 3]
>>> child = parent[2:]
>>> child
[2, 3]
>>> child.append(4)
>>> child
[2, 3, 4]
>>> parent
[0, 1, 2, 3, 4]
"""

def __init__(self, iterable=None):
if iterable:
super(SmartList, self).__init__(iterable)
else:
super(SmartList, self).__init__()
self._children = {}

def __getitem__(self, key):
if not isinstance(key, slice):
return super(SmartList, self).__getitem__(key)
key = self._normalize_slice(key, clamp=False)
sliceinfo = [key.start, key.stop, key.step]
child = _ListProxy(self, sliceinfo)
child_ref = ref(child, self._delete_child)
self._children[id(child_ref)] = (child_ref, sliceinfo)
return child

def __setitem__(self, key, item):
if not isinstance(key, slice):
return super(SmartList, self).__setitem__(key, item)
item = list(item)
super(SmartList, self).__setitem__(key, item)
key = self._normalize_slice(key, clamp=True)
diff = len(item) + (key.start - key.stop) // key.step
if not diff:
return
values = self._children.values if py3k else self._children.itervalues
for child, (start, stop, step) in values():
if start > key.stop:
self._children[id(child)][1][0] += diff
if stop is not None and stop >= key.stop:
self._children[id(child)][1][1] += diff

def __delitem__(self, key):
super(SmartList, self).__delitem__(key)
if isinstance(key, slice):
key = self._normalize_slice(key, clamp=True)
else:
key = slice(key, key + 1, 1)
diff = (key.stop - key.start) // key.step
values = self._children.values if py3k else self._children.itervalues
for child, (start, stop, step) in values():
if start > key.start:
self._children[id(child)][1][0] -= diff
if stop is not None and stop >= key.stop:
self._children[id(child)][1][1] -= diff

if not py3k:
def __getslice__(self, start, stop):
return self.__getitem__(slice(start, stop))

def __setslice__(self, start, stop, iterable):
self.__setitem__(slice(start, stop), iterable)

def __delslice__(self, start, stop):
self.__delitem__(slice(start, stop))

def __add__(self, other):
return SmartList(list(self) + other)

def __radd__(self, other):
return SmartList(other + list(self))

def __iadd__(self, other):
self.extend(other)
return self

def _delete_child(self, child_ref):
"""Remove a child reference that is about to be garbage-collected."""
del self._children[id(child_ref)]

def _detach_children(self):
"""Remove all children and give them independent parent copies."""
children = [val[0] for val in self._children.values()]
for child in children:
child()._parent = list(self)
self._children.clear()

@inheritdoc
def append(self, item):
head = len(self)
self[head:head] = [item]

@inheritdoc
def extend(self, item):
head = len(self)
self[head:head] = item

@inheritdoc
def insert(self, index, item):
self[index:index] = [item]

@inheritdoc
def pop(self, index=None):
if index is None:
index = len(self) - 1
item = self[index]
del self[index]
return item

@inheritdoc
def remove(self, item):
del self[self.index(item)]

@inheritdoc
def reverse(self):
self._detach_children()
super(SmartList, self).reverse()

if py3k:
@inheritdoc
def sort(self, key=None, reverse=None):
self._detach_children()
kwargs = {}
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
super(SmartList, self).sort(**kwargs)
else:
@inheritdoc
def sort(self, cmp=None, key=None, reverse=None):
self._detach_children()
kwargs = {}
if cmp is not None:
kwargs["cmp"] = cmp
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
super(SmartList, self).sort(**kwargs)


class _ListProxy(_SliceNormalizerMixIn, list):
"""Implement the ``list`` interface by getting elements from a parent.

This is created by a :class:`.SmartList` object when slicing. It does not
actually store the list at any time; instead, whenever the list is needed,
it builds it dynamically using the :meth:`_render` method.
"""

def __init__(self, parent, sliceinfo):
super(_ListProxy, self).__init__()
self._parent = parent
self._sliceinfo = sliceinfo

def __repr__(self):
return repr(self._render())

def __lt__(self, other):
if isinstance(other, _ListProxy):
return self._render() < list(other)
return self._render() < other

def __le__(self, other):
if isinstance(other, _ListProxy):
return self._render() <= list(other)
return self._render() <= other

def __eq__(self, other):
if isinstance(other, _ListProxy):
return self._render() == list(other)
return self._render() == other

def __ne__(self, other):
if isinstance(other, _ListProxy):
return self._render() != list(other)
return self._render() != other

def __gt__(self, other):
if isinstance(other, _ListProxy):
return self._render() > list(other)
return self._render() > other

def __ge__(self, other):
if isinstance(other, _ListProxy):
return self._render() >= list(other)
return self._render() >= other

if py3k:
def __bool__(self):
return bool(self._render())
else:
def __nonzero__(self):
return bool(self._render())

def __len__(self):
return max((self._stop - self._start) // self._step, 0)

def __getitem__(self, key):
if isinstance(key, slice):
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
return self._parent[adjusted]
else:
return self._render()[key]

def __setitem__(self, key, item):
if isinstance(key, slice):
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
self._parent[adjusted] = item
else:
length = len(self)
if key < 0:
key = length + key
if key < 0 or key >= length:
raise IndexError("list assignment index out of range")
self._parent[self._start + key] = item

def __delitem__(self, key):
if isinstance(key, slice):
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
del self._parent[adjusted]
else:
length = len(self)
if key < 0:
key = length + key
if key < 0 or key >= length:
raise IndexError("list assignment index out of range")
del self._parent[self._start + key]

def __iter__(self):
i = self._start
while i < self._stop:
yield self._parent[i]
i += self._step

def __reversed__(self):
i = self._stop - 1
while i >= self._start:
yield self._parent[i]
i -= self._step

def __contains__(self, item):
return item in self._render()

if not py3k:
def __getslice__(self, start, stop):
return self.__getitem__(slice(start, stop))

def __setslice__(self, start, stop, iterable):
self.__setitem__(slice(start, stop), iterable)

def __delslice__(self, start, stop):
self.__delitem__(slice(start, stop))

def __add__(self, other):
return SmartList(list(self) + other)

def __radd__(self, other):
return SmartList(other + list(self))

def __iadd__(self, other):
self.extend(other)
return self

def __mul__(self, other):
return SmartList(list(self) * other)

def __rmul__(self, other):
return SmartList(other * list(self))

def __imul__(self, other):
self.extend(list(self) * (other - 1))
return self

@property
def _start(self):
"""The starting index of this list, inclusive."""
return self._sliceinfo[0]

@property
def _stop(self):
"""The ending index of this list, exclusive."""
if self._sliceinfo[1] is None:
return len(self._parent)
return self._sliceinfo[1]

@property
def _step(self):
"""The number to increase the index by between items."""
return self._sliceinfo[2]

def _render(self):
"""Return the actual list from the stored start/stop/step."""
return list(self._parent)[self._start:self._stop:self._step]

@inheritdoc
def append(self, item):
self._parent.insert(self._stop, item)

@inheritdoc
def count(self, item):
return self._render().count(item)

@inheritdoc
def index(self, item, start=None, stop=None):
if start is not None:
if stop is not None:
return self._render().index(item, start, stop)
return self._render().index(item, start)
return self._render().index(item)

@inheritdoc
def extend(self, item):
self._parent[self._stop:self._stop] = item

@inheritdoc
def insert(self, index, item):
if index < 0:
index = len(self) + index
self._parent.insert(self._start + index, item)

@inheritdoc
def pop(self, index=None):
length = len(self)
if index is None:
index = length - 1
elif index < 0:
index = length + index
if index < 0 or index >= length:
raise IndexError("pop index out of range")
return self._parent.pop(self._start + index)

@inheritdoc
def remove(self, item):
index = self.index(item)
del self._parent[self._start + index]

@inheritdoc
def reverse(self):
item = self._render()
item.reverse()
self._parent[self._start:self._stop:self._step] = item

if py3k:
@inheritdoc
def sort(self, key=None, reverse=None):
item = self._render()
kwargs = {}
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
item.sort(**kwargs)
self._parent[self._start:self._stop:self._step] = item
else:
@inheritdoc
def sort(self, cmp=None, key=None, reverse=None):
item = self._render()
kwargs = {}
if cmp is not None:
kwargs["cmp"] = cmp
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
item.sort(**kwargs)
self._parent[self._start:self._stop:self._step] = item


del inheritdoc

+ 29
- 0
mwparserfromhell/smart_list/__init__.py

@@ -0,0 +1,29 @@
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
This module contains the :class:`.SmartList` type, as well as its
:class:`.ListProxy` child, which together implement a list whose sublists
reflect changes made to the main list, and vice-versa.
"""

from .list_proxy import ListProxy as _ListProxy
from .smart_list import SmartList

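Since the package re-exports ``SmartList`` and keeps a ``_ListProxy`` alias, both the old and new import paths should resolve to the same classes; a quick sanity check, assuming an installed build of this branch:

from mwparserfromhell.smart_list import SmartList, _ListProxy
from mwparserfromhell.smart_list.list_proxy import ListProxy

assert _ListProxy is ListProxy  # the alias preserves the old private name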
+ 229
- 0
mwparserfromhell/smart_list/list_proxy.py

@@ -0,0 +1,229 @@
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from .utils import _SliceNormalizerMixIn, inheritdoc


class ListProxy(_SliceNormalizerMixIn, list):
"""Implement the ``list`` interface by getting elements from a parent.

This is created by a :class:`.SmartList` object when slicing. It does not
actually store the list at any time; instead, whenever the list is needed,
it builds it dynamically using the :meth:`_render` method.
"""

def __init__(self, parent, sliceinfo):
super().__init__()
self._parent = parent
self._sliceinfo = sliceinfo

def __repr__(self):
return repr(self._render())

def __lt__(self, other):
if isinstance(other, ListProxy):
return self._render() < list(other)
return self._render() < other

def __le__(self, other):
if isinstance(other, ListProxy):
return self._render() <= list(other)
return self._render() <= other

def __eq__(self, other):
if isinstance(other, ListProxy):
return self._render() == list(other)
return self._render() == other

def __ne__(self, other):
if isinstance(other, ListProxy):
return self._render() != list(other)
return self._render() != other

def __gt__(self, other):
if isinstance(other, ListProxy):
return self._render() > list(other)
return self._render() > other

def __ge__(self, other):
if isinstance(other, ListProxy):
return self._render() >= list(other)
return self._render() >= other

def __bool__(self):
return bool(self._render())

def __len__(self):
return max((self._stop - self._start) // self._step, 0)

def __getitem__(self, key):
if isinstance(key, slice):
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
return self._parent[adjusted]
return self._render()[key]

def __setitem__(self, key, item):
if isinstance(key, slice):
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
self._parent[adjusted] = item
else:
length = len(self)
if key < 0:
key = length + key
if key < 0 or key >= length:
raise IndexError("list assignment index out of range")
self._parent[self._start + key] = item

def __delitem__(self, key):
if isinstance(key, slice):
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
del self._parent[adjusted]
else:
length = len(self)
if key < 0:
key = length + key
if key < 0 or key >= length:
raise IndexError("list assignment index out of range")
del self._parent[self._start + key]

def __iter__(self):
i = self._start
while i < self._stop:
yield self._parent[i]
i += self._step

def __reversed__(self):
i = self._stop - 1
while i >= self._start:
yield self._parent[i]
i -= self._step

def __contains__(self, item):
return item in self._render()

def __add__(self, other):
return type(self._parent)(list(self) + other)

def __radd__(self, other):
return type(self._parent)(other + list(self))

def __iadd__(self, other):
self.extend(other)
return self

def __mul__(self, other):
return type(self._parent)(list(self) * other)

def __rmul__(self, other):
return type(self._parent)(other * list(self))

def __imul__(self, other):
self.extend(list(self) * (other - 1))
return self

@property
def _start(self):
"""The starting index of this list, inclusive."""
return self._sliceinfo[0]

@property
def _stop(self):
"""The ending index of this list, exclusive."""
if self._sliceinfo[1] is None:
return len(self._parent)
return self._sliceinfo[1]

@property
def _step(self):
"""The number to increase the index by between items."""
return self._sliceinfo[2]

def _render(self):
"""Return the actual list from the stored start/stop/step."""
return list(self._parent)[self._start:self._stop:self._step]

@inheritdoc
def append(self, item):
self._parent.insert(self._stop, item)

@inheritdoc
def count(self, item):
return self._render().count(item)

@inheritdoc
def index(self, item, start=None, stop=None):
if start is not None:
if stop is not None:
return self._render().index(item, start, stop)
return self._render().index(item, start)
return self._render().index(item)

@inheritdoc
def extend(self, item):
self._parent[self._stop:self._stop] = item

@inheritdoc
def insert(self, index, item):
if index < 0:
index = len(self) + index
self._parent.insert(self._start + index, item)

@inheritdoc
def pop(self, index=None):
length = len(self)
if index is None:
index = length - 1
elif index < 0:
index = length + index
if index < 0 or index >= length:
raise IndexError("pop index out of range")
return self._parent.pop(self._start + index)

@inheritdoc
def remove(self, item):
index = self.index(item)
del self._parent[self._start + index]

@inheritdoc
def reverse(self):
item = self._render()
item.reverse()
self._parent[self._start:self._stop:self._step] = item

@inheritdoc
def sort(self, key=None, reverse=None):
item = self._render()
kwargs = {}
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
item.sort(**kwargs)
self._parent[self._start:self._stop:self._step] = item

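A small doctest-style illustration of the behaviour described in the class docstring — the proxy stores only slice info and re-renders from its parent on demand (session shown is illustrative):

>>> from mwparserfromhell.smart_list import SmartList
>>> parent = SmartList([0, 1, 2, 3])
>>> child = parent[1:3]   # a ListProxy holding sliceinfo [1, 3, 1]
>>> parent[2] = 99        # mutate the parent directly
>>> child                 # _render() picks up the change
[1, 99]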
+ 158
- 0
mwparserfromhell/smart_list/smart_list.py

@@ -0,0 +1,158 @@
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from weakref import ref

from .list_proxy import ListProxy
from .utils import _SliceNormalizerMixIn, inheritdoc


class SmartList(_SliceNormalizerMixIn, list):
"""Implements the ``list`` interface with special handling of sublists.

When a sublist is created (by ``list[i:j]``), any changes made to this
list (such as the addition, removal, or replacement of elements) will be
reflected in the sublist, or vice-versa, to the greatest degree possible.
This is implemented by having sublists - instances of the
:class:`.ListProxy` type - dynamically determine their elements by storing
their slice info and retrieving that slice from the parent. Methods that
change the size of the list also change the slice info. For example::

>>> parent = SmartList([0, 1, 2, 3])
>>> parent
[0, 1, 2, 3]
>>> child = parent[2:]
>>> child
[2, 3]
>>> child.append(4)
>>> child
[2, 3, 4]
>>> parent
[0, 1, 2, 3, 4]
"""

def __init__(self, iterable=None):
if iterable:
super().__init__(iterable)
else:
super().__init__()
self._children = {}

def __getitem__(self, key):
if not isinstance(key, slice):
return super().__getitem__(key)
key = self._normalize_slice(key, clamp=False)
sliceinfo = [key.start, key.stop, key.step]
child = ListProxy(self, sliceinfo)
child_ref = ref(child, self._delete_child)
self._children[id(child_ref)] = (child_ref, sliceinfo)
return child

def __setitem__(self, key, item):
if not isinstance(key, slice):
super().__setitem__(key, item)
return
item = list(item)
super().__setitem__(key, item)
key = self._normalize_slice(key, clamp=True)
diff = len(item) + (key.start - key.stop) // key.step
if not diff:
return
for child, (start, stop, _step) in self._children.values():
if start > key.stop:
self._children[id(child)][1][0] += diff
if stop is not None and stop >= key.stop:
self._children[id(child)][1][1] += diff

def __delitem__(self, key):
super().__delitem__(key)
if isinstance(key, slice):
key = self._normalize_slice(key, clamp=True)
else:
key = slice(key, key + 1, 1)
diff = (key.stop - key.start) // key.step
for child, (start, stop, _step) in self._children.values():
if start > key.start:
self._children[id(child)][1][0] -= diff
if stop is not None and stop >= key.stop:
self._children[id(child)][1][1] -= diff

def __add__(self, other):
return SmartList(list(self) + other)

def __radd__(self, other):
return SmartList(other + list(self))

def __iadd__(self, other):
self.extend(other)
return self

def _delete_child(self, child_ref):
"""Remove a child reference that is about to be garbage-collected."""
del self._children[id(child_ref)]

def _detach_children(self):
"""Remove all children and give them independent parent copies."""
children = [val[0] for val in self._children.values()]
for child in children:
child()._parent = list(self)
self._children.clear()

@inheritdoc
def append(self, item):
head = len(self)
self[head:head] = [item]

@inheritdoc
def extend(self, item):
head = len(self)
self[head:head] = item

@inheritdoc
def insert(self, index, item):
self[index:index] = [item]

@inheritdoc
def pop(self, index=None):
if index is None:
index = len(self) - 1
item = self[index]
del self[index]
return item

@inheritdoc
def remove(self, item):
del self[self.index(item)]

@inheritdoc
def reverse(self):
self._detach_children()
super().reverse()

@inheritdoc
def sort(self, key=None, reverse=None):
self._detach_children()
kwargs = {}
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
super().sort(**kwargs)

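Beyond the docstring example, the slice-info bookkeeping in ``__setitem__``/``__delitem__`` is what keeps children aligned when the parent grows or shrinks before their slice; an illustrative session:

>>> from mwparserfromhell.smart_list import SmartList
>>> parent = SmartList(["a", "b", "c", "d"])
>>> child = parent[2:]       # sliceinfo starts as [2, None, 1]
>>> parent.insert(0, "x")    # insertion before the slice shifts its start to 3
>>> parent
['x', 'a', 'b', 'c', 'd']
>>> child
['c', 'd']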
+ 50
- 0
mwparserfromhell/smart_list/utils.py

@@ -0,0 +1,50 @@
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from sys import maxsize

__all__ = []


def inheritdoc(method):
"""Set __doc__ of *method* to __doc__ of *method* in its parent class.

Since this is used on :class:`.SmartList`, the "parent class" used is
``list``. This function can be used as a decorator.
"""
method.__doc__ = getattr(list, method.__name__).__doc__
return method


class _SliceNormalizerMixIn:
"""MixIn that provides a private method to normalize slices."""

def _normalize_slice(self, key, clamp=False):
"""Return a slice equivalent to the input *key*, standardized."""
if key.start is None:
start = 0
else:
start = (len(self) + key.start) if key.start < 0 else key.start
if key.stop is None or key.stop == maxsize:
stop = len(self) if clamp else None
else:
stop = (len(self) + key.stop) if key.stop < 0 else key.stop
return slice(start, stop, key.step or 1)

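A doctest-style sketch of what ``_normalize_slice`` produces: negative starts are resolved against ``len(self)``, and an open stop is clamped to the length only when ``clamp=True`` (illustrative, using a hypothetical ``MyList`` subclass):

>>> from mwparserfromhell.smart_list.utils import _SliceNormalizerMixIn
>>> class MyList(_SliceNormalizerMixIn, list):
...     pass
>>> MyList([10, 20, 30, 40])._normalize_slice(slice(-3, None), clamp=True)
slice(1, 4, 1)
>>> MyList([10, 20, 30, 40])._normalize_slice(slice(-3, None), clamp=False)
slice(1, None, 1)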
+ 28
- 46
mwparserfromhell/string_mixin.py

@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -22,14 +20,11 @@

"""
This module contains the :class:`.StringMixIn` type, which implements the
interface for the ``unicode`` type (``str`` on py3k) in a dynamic manner.
interface for the ``str`` type in a dynamic manner.
"""

from __future__ import unicode_literals
from sys import getdefaultencoding

from .compat import bytes, py3k, str

__all__ = ["StringMixIn"]


def inheritdoc(method):
@@ -41,80 +36,67 @@ def inheritdoc(method):
method.__doc__ = getattr(str, method.__name__).__doc__
return method


class StringMixIn(object):
"""Implement the interface for ``unicode``/``str`` in a dynamic manner.
class StringMixIn:
"""Implement the interface for ``str`` in a dynamic manner.


To use this class, inherit from it and override the :meth:`__unicode__`
method (same on py3k) to return the string representation of the object.
The various string methods will operate on the value of :meth:`__unicode__`
instead of the immutable ``self`` like the regular ``str`` type.
To use this class, inherit from it and override the :meth:`__str__` method
to return the string representation of the object. The various string
methods will operate on the value of :meth:`__str__` instead of the
immutable ``self`` like the regular ``str`` type.
""" """


if py3k:
def __str__(self):
return self.__unicode__()

def __bytes__(self):
return bytes(self.__unicode__(), getdefaultencoding())
else:
def __str__(self):
return bytes(self.__unicode__())

def __unicode__(self):
def __str__(self):
raise NotImplementedError()

def __bytes__(self):
return bytes(self.__str__(), getdefaultencoding())

def __repr__(self):
return repr(self.__unicode__())
return repr(self.__str__())

def __lt__(self, other):
return self.__unicode__() < other
return self.__str__() < other

def __le__(self, other):
return self.__unicode__() <= other
return self.__str__() <= other

def __eq__(self, other):
return self.__unicode__() == other
return self.__str__() == other

def __ne__(self, other):
return self.__unicode__() != other
return self.__str__() != other

def __gt__(self, other):
return self.__unicode__() > other
return self.__str__() > other

def __ge__(self, other):
return self.__unicode__() >= other
return self.__str__() >= other


if py3k:
def __bool__(self):
return bool(self.__unicode__())
else:
def __nonzero__(self):
return bool(self.__unicode__())
def __bool__(self):
return bool(self.__str__())


def __len__(self):
return len(self.__unicode__())
return len(self.__str__())

def __iter__(self):
for char in self.__unicode__():
yield char
yield from self.__str__()

def __getitem__(self, key):
return self.__unicode__()[key]
return self.__str__()[key]

def __reversed__(self):
return reversed(self.__unicode__())
return reversed(self.__str__())

def __contains__(self, item):
return str(item) in self.__unicode__()
return str(item) in self.__str__()

def __getattr__(self, attr):
if not hasattr(str, attr):
raise AttributeError("{!r} object has no attribute {!r}".format(
type(self).__name__, attr))
return getattr(self.__unicode__(), attr)
return getattr(self.__str__(), attr)


if py3k:
maketrans = str.maketrans # Static method can't rely on __getattr__
maketrans = str.maketrans # Static method can't rely on __getattr__




del inheritdoc

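To illustrate the new contract, a minimal subclass only needs ``__str__``; every other string operation is delegated through ``__getattr__`` and the dunder methods above (sketch, not part of the diff):

from mwparserfromhell.string_mixin import StringMixIn

class Greeting(StringMixIn):
    def __str__(self):
        return "hello wiki"

g = Greeting()
assert g.upper() == "HELLO WIKI"   # resolved via __getattr__ -> str.upper
assert "wiki" in g                 # __contains__ compares against __str__()
assert bytes(g) == b"hello wiki"   # __bytes__ encodes with the default encoding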
+ 20
- 25
mwparserfromhell/utils.py

@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -25,50 +23,47 @@ This module contains accessory functions for other parts of the library. Parser
users generally won't need stuff from here.
"""

from __future__ import unicode_literals

from .compat import bytes, str
from .nodes import Node
from .smart_list import SmartList

__all__ = ["parse_anything"]


def parse_anything(value, context=0, skip_style_tags=False):
"""Return a :class:`.Wikicode` for *value*, allowing multiple types.

This differs from :meth:`.Parser.parse` in that we accept more than just a
string to be parsed. Unicode objects (strings in py3k), strings (bytes in
py3k), integers (converted to strings), ``None``, existing :class:`.Node`
or :class:`.Wikicode` objects, as well as an iterable of these types, are
supported. This is used to parse input on-the-fly by various methods of
:class:`.Wikicode` and others like :class:`.Template`, such as
:meth:`wikicode.insert() <.Wikicode.insert>` or setting
:meth:`template.name <.Template.name>`.
string to be parsed. Strings, bytes, integers (converted to strings),
``None``, existing :class:`.Node` or :class:`.Wikicode` objects, as well
as an iterable of these types, are supported. This is used to parse input
on-the-fly by various methods of :class:`.Wikicode` and others like
:class:`.Template`, such as :meth:`wikicode.insert() <.Wikicode.insert>`
or setting :meth:`template.name <.Template.name>`.


Additional arguments are passed directly to :meth:`.Parser.parse`.
"""
# pylint: disable=cyclic-import,import-outside-toplevel
from .nodes import Node
from .parser import Parser
from .smart_list import SmartList
from .wikicode import Wikicode

if isinstance(value, Wikicode):
return value
elif isinstance(value, Node):
if isinstance(value, Node):
return Wikicode(SmartList([value]))
elif isinstance(value, str):
if isinstance(value, str):
return Parser().parse(value, context, skip_style_tags)
elif isinstance(value, bytes):
if isinstance(value, bytes):
return Parser().parse(value.decode("utf8"), context, skip_style_tags)
elif isinstance(value, int):
if isinstance(value, int):
return Parser().parse(str(value), context, skip_style_tags)
elif value is None:
if value is None:
return Wikicode(SmartList())
elif hasattr(value, "read"):
if hasattr(value, "read"):
return parse_anything(value.read(), context, skip_style_tags)
try:
nodelist = SmartList()
for item in value:
nodelist += parse_anything(item, context, skip_style_tags).nodes
return Wikicode(nodelist)
except TypeError:
error = "Needs string, Node, Wikicode, file, int, None, or iterable of these, but got {0}: {1}"
raise ValueError(error.format(type(value).__name__, value))
except TypeError as exc:
error = ("Needs string, Node, Wikicode, file, int, None, or "
"iterable of these, but got {0}: {1}")
raise ValueError(error.format(type(value).__name__, value)) from exc

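The simplified dispatch above accepts several input types; a doctest-style example of mixing them (output shown assumes the normal round-tripping parser):

>>> from mwparserfromhell.utils import parse_anything
>>> parse_anything(["foo ", 42, None, b" {{bar}}"])
'foo 42 {{bar}}'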
+ 20
- 25
mwparserfromhell/wikicode.py

@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -20,14 +18,12 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
from itertools import chain
import re
from itertools import chain

from .compat import bytes, py3k, range, str
from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity,
Node, Tag, Template, Text, Wikilink)
from .smart_list import _ListProxy
from .smart_list.list_proxy import ListProxy
from .string_mixin import StringMixIn
from .utils import parse_anything

@@ -48,10 +44,10 @@ class Wikicode(StringMixIn):
RECURSE_OTHERS = 2

def __init__(self, nodes):
super(Wikicode, self).__init__()
super().__init__()
self._nodes = nodes

def __unicode__(self):
def __str__(self):
return "".join([str(node) for node in self.nodes])

@staticmethod
@@ -63,8 +59,7 @@ class Wikicode(StringMixIn):
for code in node.__children__():
for child in code.nodes:
sub = Wikicode._get_children(child, contexts, restrict, code)
for result in sub:
yield result
yield from sub

@staticmethod
def _slice_replace(code, index, old, new):
@@ -112,7 +107,7 @@ class Wikicode(StringMixIn):
def _is_child_wikicode(self, obj, recursive=True):
"""Return whether the given :class:`.Wikicode` is a descendant."""
def deref(nodes):
if isinstance(nodes, _ListProxy):
if isinstance(nodes, ListProxy):
return nodes._parent # pylint: disable=protected-access
return nodes

@@ -252,13 +247,13 @@ class Wikicode(StringMixIn):
self.ifilter(forcetype=ftype, *a, **kw))
make_filter = lambda ftype: (lambda self, *a, **kw:
self.filter(forcetype=ftype, *a, **kw))
for name, ftype in (meths.items() if py3k else meths.iteritems()):
ifilter = make_ifilter(ftype)
filter = make_filter(ftype)
ifilter.__doc__ = doc.format(name, "ifilter", ftype)
filter.__doc__ = doc.format(name, "filter", ftype)
setattr(cls, "ifilter_" + name, ifilter)
setattr(cls, "filter_" + name, filter)
for name, ftype in meths.items():
ifilt = make_ifilter(ftype)
filt = make_filter(ftype)
ifilt.__doc__ = doc.format(name, "ifilter", ftype)
filt.__doc__ = doc.format(name, "filter", ftype)
setattr(cls, "ifilter_" + name, ifilt)
setattr(cls, "filter_" + name, filt)


@property
def nodes(self):
@@ -355,6 +350,7 @@ class Wikicode(StringMixIn):
ancestors = _get_ancestors(code, needle)
if ancestors is not None:
return [node] + ancestors
return None

if isinstance(obj, Wikicode):
obj = obj.get(0)
@@ -447,13 +443,13 @@ class Wikicode(StringMixIn):
"""
if isinstance(obj, (Node, Wikicode)):
context, index = self._do_strong_search(obj, recursive)
for i in range(index.start, index.stop):
for _ in range(index.start, index.stop):
context.nodes.pop(index.start)
context.insert(index.start, value)
else:
for exact, context, index in self._do_weak_search(obj, recursive):
if exact:
for i in range(index.start, index.stop):
for _ in range(index.start, index.stop):
context.nodes.pop(index.start)
context.insert(index.start, value)
else:
@@ -482,12 +478,12 @@ class Wikicode(StringMixIn):
"""
if isinstance(obj, (Node, Wikicode)):
context, index = self._do_strong_search(obj, recursive)
for i in range(index.start, index.stop):
for _ in range(index.start, index.stop):
context.nodes.pop(index.start)
else:
for exact, context, index in self._do_weak_search(obj, recursive):
if exact:
for i in range(index.start, index.stop):
for _ in range(index.start, index.stop):
context.nodes.pop(index.start)
else:
self._slice_replace(context, index, str(obj), "")
@@ -649,8 +645,7 @@ class Wikicode(StringMixIn):
while "\n\n\n" in stripped:
stripped = stripped.replace("\n\n\n", "\n\n")
return stripped
else:
return "".join(nodes)
return "".join(nodes)

def get_tree(self):
"""Return a hierarchical tree representation of the object.


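The ``meths.items()`` loop above is what generates the ``filter_templates()``, ``ifilter_wikilinks()``, etc. convenience methods; roughly, usage looks like this (illustrative session):

>>> import mwparserfromhell
>>> code = mwparserfromhell.parse("{{foo|{{bar}}}} [[baz]]")
>>> code.filter_templates()            # generated filter_* method
['{{foo|{{bar}}}}', '{{bar}}']
>>> code.filter_wikilinks(recursive=False)
['[[baz]]']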
+ 7
- 16
scripts/memtest.py

@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -21,10 +19,10 @@
# SOFTWARE.

"""
Tests for memory leaks in the CTokenizer. Python 2 and 3 compatible.
Tests for memory leaks in the CTokenizer.

This appears to work mostly fine under Linux, but gives an absurd number of
false positives on OS X. I'm not sure why. Running the tests multiple times
false positives on macOS. I'm not sure why. Running the tests multiple times
yields different results (tests don't always leak, and the amount they leak by
varies). Increasing the number of loops results in a smaller bytes/loop value,
too, indicating the increase in memory usage might be due to something else.
@@ -32,7 +30,6 @@ Actual memory leaks typically leak very large amounts of memory (megabytes)
and scale with the number of loops.
"""

from __future__ import unicode_literals, print_function
from locale import LC_ALL, setlocale
from multiprocessing import Process, Pipe
from os import listdir, path
@@ -40,22 +37,18 @@ import sys

import psutil

from mwparserfromhell.compat import py3k
from mwparserfromhell.parser._tokenizer import CTokenizer

if sys.version_info[0] == 2:
range = xrange

LOOPS = 10000

class Color(object):
class Color:
GRAY = "\x1b[30;1m"
GREEN = "\x1b[92m"
YELLOW = "\x1b[93m"
RESET = "\x1b[0m"


class MemoryTest(object):
class MemoryTest:
"""Manages a memory test."""

def __init__(self):
@@ -88,8 +81,6 @@ class MemoryTest(object):
def load_file(filename):
with open(filename, "rU") as fp:
text = fp.read()
if not py3k:
text = text.decode("utf8")
name = path.split(filename)[1][:0-len(extension)]
self._parse_file(name, text)

@@ -154,13 +145,13 @@ class MemoryTest(object):

def _runner(text, child):
r1, r2 = range(250), range(LOOPS)
for i in r1:
for _ in r1:
CTokenizer().tokenize(text)
child.send("OK")
child.recv()
child.send("OK")
child.recv()
for i in r2:
for _ in r2:
CTokenizer().tokenize(text)
child.send("OK")
child.recv()


+ 4
- 12
setup.py

@@ -1,7 +1,6 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -21,23 +20,17 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import print_function
from distutils.errors import DistutilsError, CCompilerError
from glob import glob
from os import environ
import sys

if ((sys.version_info[0] == 2 and sys.version_info[1] < 7) or
(sys.version_info[1] == 3 and sys.version_info[1] < 4)):
raise RuntimeError("mwparserfromhell needs Python 2.7 or 3.4+")

from setuptools import setup, find_packages, Extension
from setuptools.command.build_ext import build_ext

from mwparserfromhell import __version__
from mwparserfromhell.compat import py3k

with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp:
with open("README.rst") as fp:
long_docs = fp.read()

use_extension = True
@@ -84,6 +77,7 @@ setup(
ext_modules = [tokenizer] if use_extension else [],
test_suite = "tests",
version = __version__,
python_requires = ">= 3.5",
author = "Ben Kurtovic",
author_email = "ben.kurtovic@gmail.com",
url = "https://github.com/earwig/mwparserfromhell",
@@ -98,14 +92,12 @@ setup(
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 2",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Topic :: Text Processing :: Markup"
],
)

+ 0
- 1
tests/__init__.py

@@ -1 +0,0 @@
# -*- coding: utf-8 -*-

+ 5
- 12
tests/_test_tokenizer.py

@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -20,22 +18,19 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import codecs
from os import listdir, path
import sys
import warnings

from mwparserfromhell.compat import py3k, str
from mwparserfromhell.parser import tokens
from mwparserfromhell.parser.builder import Builder

class _TestParseError(Exception):
"""Raised internally when a test could not be parsed."""
pass


class TokenizerTestCase(object):
class TokenizerTestCase:
"""A base test case for tokenizers, whose tests are loaded dynamically.

Subclassed along with unittest.TestCase to form TestPyTokenizer and
@@ -44,7 +39,7 @@ class TokenizerTestCase(object):
"""

@staticmethod
def _build_test_method(funcname, data):
def _build_test_method(data):
"""Create and return a method to be treated as a test case method.

*data* is a dict containing multiple keys: the *input* text to be
@@ -60,8 +55,6 @@ class TokenizerTestCase(object):
actual = self.tokenizer().tokenize(data["input"])
self.assertEqual(expected, actual)

if not py3k:
inner.__name__ = funcname.encode("utf8")
inner.__doc__ = data["label"]
return inner

@@ -84,7 +77,7 @@ class TokenizerTestCase(object):
try:
data["output"] = eval(raw, vars(tokens))
except Exception as err:
raise _TestParseError(err)
raise _TestParseError(err) from err

@classmethod
def _load_tests(cls, filename, name, text, restrict=None):
@@ -120,7 +113,7 @@ class TokenizerTestCase(object):
continue

fname = "test_{}{}_{}".format(name, number, data["name"])
meth = cls._build_test_method(fname, data)
meth = cls._build_test_method(data)
setattr(cls, fname, meth)

@classmethod


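For reference, the pattern used by ``_build_test_method``/``setattr`` above — building one closure per fixture and attaching it to the class so unittest discovers it — looks roughly like this (standalone sketch with made-up fixture data):

import unittest

class ExampleCase(unittest.TestCase):
    pass

def _build_test_method(data):
    def inner(self):
        self.assertEqual(data["expected"], data["input"].upper())
    inner.__doc__ = data["label"]
    return inner

fixtures = [{"label": "uppercases ascii", "input": "ab", "expected": "AB"}]
for number, data in enumerate(fixtures):
    setattr(ExampleCase, "test_generated_{}".format(number), _build_test_method(data))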
+ 1
- 6
tests/_test_tree_equality.py

@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -20,13 +18,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
from unittest import TestCase

from mwparserfromhell.compat import range
from mwparserfromhell.nodes import (Argument, Comment, Heading, HTMLEntity,
Tag, Template, Text, Wikilink)
from mwparserfromhell.nodes.extras import Attribute, Parameter
from mwparserfromhell.smart_list import SmartList
from mwparserfromhell.wikicode import Wikicode




+ 0
- 18
tests/compat.py

@@ -1,18 +0,0 @@
# -*- coding: utf-8 -*-

"""
Serves the same purpose as mwparserfromhell.compat, but only for objects
required by unit tests. This avoids unnecessary imports (like urllib) within
the main library.
"""

from mwparserfromhell.compat import py3k

if py3k:
from io import StringIO
from urllib.parse import urlencode
from urllib.request import urlopen

else:
from StringIO import StringIO
from urllib import urlencode, urlopen

+ 3
- 7
tests/test_argument.py

@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import unittest

from mwparserfromhell.compat import str
from mwparserfromhell.nodes import Argument, Text

from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -31,8 +27,8 @@ from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
class TestArgument(TreeEqualityTestCase):
"""Test cases for the Argument node."""

def test_unicode(self):
"""test Argument.__unicode__()"""
def test_str(self):
"""test Argument.__str__()"""
node = Argument(wraptext("foobar"))
self.assertEqual("{{{foobar}}}", str(node))
node2 = Argument(wraptext("foo"), wraptext("bar"))


+ 3
- 7
tests/test_attribute.py

@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import unittest

from mwparserfromhell.compat import str
from mwparserfromhell.nodes import Template
from mwparserfromhell.nodes.extras import Attribute

@@ -32,8 +28,8 @@ from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
class TestAttribute(TreeEqualityTestCase):
"""Test cases for the Attribute node extra."""

def test_unicode(self):
"""test Attribute.__unicode__()"""
def test_str(self):
"""test Attribute.__str__()"""
node = Attribute(wraptext("foo"))
self.assertEqual(" foo", str(node))
node2 = Attribute(wraptext("foo"), wraptext("bar"))


+1 -6  tests/test_builder.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
-from mwparserfromhell.compat import py3k
 from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading,
                                     HTMLEntity, Tag, Template, Text, Wikilink)
 from mwparserfromhell.nodes.extras import Attribute, Parameter
@@ -428,9 +424,8 @@ class TestBuilder(TreeEqualityTestCase):
             [tokens.TagOpenOpen()]
         ]
 
-        func = self.assertRaisesRegex if py3k else self.assertRaisesRegexp
         msg = r"_handle_token\(\) got unexpected TemplateClose"
-        func(ParserError, msg, self.builder.build, [tokens.TemplateClose()])
+        self.assertRaisesRegex(ParserError, msg, self.builder.build, [tokens.TemplateClose()])
         for test in missing_closes:
            self.assertRaises(ParserError, self.builder.build, test)




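assertRaisesRegexp was the Python 2 spelling; unittest on Python 3 provides assertRaisesRegex, so the runtime dispatch through py3k goes away. A standalone sketch of the assertion style (not the project's real test case):

import unittest

class RegexAssertionExample(unittest.TestCase):
    @staticmethod
    def _explode(message):
        raise ValueError(message)

    def test_regex_assertion(self):
        # One spelling is enough on Python 3.
        msg = r"got unexpected TemplateClose"
        self.assertRaisesRegex(ValueError, msg, self._explode,
                               "_handle_token() got unexpected TemplateClose")

if __name__ == "__main__":
    unittest.main()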
+3 -7  tests/test_comment.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
-from mwparserfromhell.compat import str
 from mwparserfromhell.nodes import Comment
 
 from ._test_tree_equality import TreeEqualityTestCase
@@ -31,8 +27,8 @@ from ._test_tree_equality import TreeEqualityTestCase
 class TestComment(TreeEqualityTestCase):
     """Test cases for the Comment node."""
 
-    def test_unicode(self):
-        """test Comment.__unicode__()"""
+    def test_str(self):
+        """test Comment.__str__()"""
         node = Comment("foobar")
         self.assertEqual("<!--foobar-->", str(node))




+0 -3  tests/test_ctokenizer.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -20,7 +18,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
 try:


+15 -34  tests/test_docs.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,23 +18,22 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import print_function, unicode_literals
 import json
+from io import StringIO
 import os
 import unittest
+from urllib.parse import urlencode
+from urllib.request import urlopen
 
 import mwparserfromhell
-from mwparserfromhell.compat import py3k, str
-
-from .compat import StringIO, urlencode, urlopen
 
 
 class TestDocs(unittest.TestCase):
     """Integration test cases for mwparserfromhell's documentation."""
 
-    def assertPrint(self, input, output):
-        """Assertion check that *input*, when printed, produces *output*."""
+    def assertPrint(self, value, output):
+        """Assertion check that *value*, when printed, produces *output*."""
         buff = StringIO()
-        print(input, end="", file=buff)
+        print(value, end="", file=buff)
         buff.seek(0)
         self.assertEqual(output, buff.read())
@@ -47,16 +44,10 @@ class TestDocs(unittest.TestCase):
         self.assertPrint(wikicode,
                          "I has a template! {{foo|bar|baz|eggs=spam}} See it?")
         templates = wikicode.filter_templates()
-        if py3k:
-            self.assertPrint(templates, "['{{foo|bar|baz|eggs=spam}}']")
-        else:
-            self.assertPrint(templates, "[u'{{foo|bar|baz|eggs=spam}}']")
+        self.assertPrint(templates, "['{{foo|bar|baz|eggs=spam}}']")
         template = templates[0]
         self.assertPrint(template.name, "foo")
-        if py3k:
-            self.assertPrint(template.params, "['bar', 'baz', 'eggs=spam']")
-        else:
-            self.assertPrint(template.params, "[u'bar', u'baz', u'eggs=spam']")
+        self.assertPrint(template.params, "['bar', 'baz', 'eggs=spam']")
         self.assertPrint(template.get(1).value, "bar")
         self.assertPrint(template.get("eggs").value, "spam")
@@ -64,21 +55,14 @@ class TestDocs(unittest.TestCase):
         """test a block of example code in the README"""
         text = "{{foo|{{bar}}={{baz|{{spam}}}}}}"
         temps = mwparserfromhell.parse(text).filter_templates()
-        if py3k:
-            res = "['{{foo|{{bar}}={{baz|{{spam}}}}}}', '{{bar}}', '{{baz|{{spam}}}}', '{{spam}}']"
-        else:
-            res = "[u'{{foo|{{bar}}={{baz|{{spam}}}}}}', u'{{bar}}', u'{{baz|{{spam}}}}', u'{{spam}}']"
+        res = "['{{foo|{{bar}}={{baz|{{spam}}}}}}', '{{bar}}', '{{baz|{{spam}}}}', '{{spam}}']"
         self.assertPrint(temps, res)
 
     def test_readme_3(self):
         """test a block of example code in the README"""
         code = mwparserfromhell.parse("{{foo|this {{includes a|template}}}}")
-        if py3k:
-            self.assertPrint(code.filter_templates(recursive=False),
-                             "['{{foo|this {{includes a|template}}}}']")
-        else:
-            self.assertPrint(code.filter_templates(recursive=False),
-                             "[u'{{foo|this {{includes a|template}}}}']")
+        self.assertPrint(code.filter_templates(recursive=False),
+                         "['{{foo|this {{includes a|template}}}}']")
         foo = code.filter_templates(recursive=False)[0]
         self.assertPrint(foo.get(1).value, "this {{includes a|template}}")
         self.assertPrint(foo.get(1).value.filter_templates()[0],
@@ -98,10 +82,7 @@ class TestDocs(unittest.TestCase):
         code.replace("{{uncategorized}}", "{{bar-stub}}")
         res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}"
         self.assertPrint(code, res)
-        if py3k:
-            res = "['{{cleanup|date=July 2012}}', '{{bar-stub}}']"
-        else:
-            res = "[u'{{cleanup|date=July 2012}}', u'{{bar-stub}}']"
+        res = "['{{cleanup|date=July 2012}}', '{{bar-stub}}']"
         self.assertPrint(code.filter_templates(), res)
         text = str(code)
         res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}"
@@ -126,14 +107,14 @@ class TestDocs(unittest.TestCase):
         }
         try:
             raw = urlopen(url1, urlencode(data).encode("utf8")).read()
-        except IOError:
+        except OSError:
             self.skipTest("cannot continue because of unsuccessful web call")
         res = json.loads(raw.decode("utf8"))
         revision = res["query"]["pages"][0]["revisions"][0]
         text = revision["slots"]["main"]["content"]
         try:
             expected = urlopen(url2.format(title)).read().decode("utf8")
-        except IOError:
+        except OSError:
             self.skipTest("cannot continue because of unsuccessful web call")
         actual = mwparserfromhell.parse(text)
         self.assertEqual(expected, actual)


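Catching OSError instead of IOError changes nothing behaviour-wise on Python 3, where IOError is an alias of OSError and urllib's URLError also derives from it; the new spelling just drops the legacy name. A small sketch of the skip-on-network-failure idiom (the URL is a placeholder):

from urllib.request import urlopen

def fetch_or_none(url):
    # URLError (and thus most urlopen failures) is a subclass of OSError,
    # so a single except clause covers the unreachable-network case.
    try:
        return urlopen(url).read()
    except OSError:
        return None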
+3 -7  tests/test_external_link.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
-from mwparserfromhell.compat import str
 from mwparserfromhell.nodes import ExternalLink, Text
 
 from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -31,8 +27,8 @@ from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
 class TestExternalLink(TreeEqualityTestCase):
     """Test cases for the ExternalLink node."""
 
-    def test_unicode(self):
-        """test ExternalLink.__unicode__()"""
+    def test_str(self):
+        """test ExternalLink.__str__()"""
         node = ExternalLink(wraptext("http://example.com/"), brackets=False)
         self.assertEqual("http://example.com/", str(node))
         node2 = ExternalLink(wraptext("http://example.com/"))


+3 -7  tests/test_heading.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
-from mwparserfromhell.compat import str
 from mwparserfromhell.nodes import Heading, Text
 
 from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -31,8 +27,8 @@ from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
 class TestHeading(TreeEqualityTestCase):
     """Test cases for the Heading node."""
 
-    def test_unicode(self):
-        """test Heading.__unicode__()"""
+    def test_str(self):
+        """test Heading.__str__()"""
         node = Heading(wraptext("foobar"), 2)
         self.assertEqual("==foobar==", str(node))
         node2 = Heading(wraptext(" zzz "), 5)


+4 -8  tests/test_html_entity.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,19 +18,17 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
-from mwparserfromhell.compat import str
 from mwparserfromhell.nodes import HTMLEntity
 
-from ._test_tree_equality import TreeEqualityTestCase, wrap
+from ._test_tree_equality import TreeEqualityTestCase
 
 
 class TestHTMLEntity(TreeEqualityTestCase):
     """Test cases for the HTMLEntity node."""
 
-    def test_unicode(self):
-        """test HTMLEntity.__unicode__()"""
+    def test_str(self):
+        """test HTMLEntity.__str__()"""
         node1 = HTMLEntity("nbsp", named=True, hexadecimal=False)
         node2 = HTMLEntity("107", named=False, hexadecimal=False)
         node3 = HTMLEntity("6b", named=False, hexadecimal=True)


+4 -9  tests/test_parameter.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,20 +18,17 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
-from mwparserfromhell.compat import str
-from mwparserfromhell.nodes import Text
 from mwparserfromhell.nodes.extras import Parameter
 
-from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
+from ._test_tree_equality import TreeEqualityTestCase, wraptext
 
 
 class TestParameter(TreeEqualityTestCase):
     """Test cases for the Parameter node extra."""
 
-    def test_unicode(self):
-        """test Parameter.__unicode__()"""
+    def test_str(self):
+        """test Parameter.__str__()"""
         node = Parameter(wraptext("1"), wraptext("foo"), showkey=False)
         self.assertEqual("foo", str(node))
         node2 = Parameter(wraptext("foo"), wraptext("bar"))


+0 -4  tests/test_parser.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -20,11 +18,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
 from mwparserfromhell import parser
-from mwparserfromhell.compat import range
 from mwparserfromhell.nodes import Tag, Template, Text, Wikilink
 from mwparserfromhell.nodes.extras import Parameter




+0 -3  tests/test_pytokenizer.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -20,7 +18,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
 from mwparserfromhell.parser import contexts


+0 -3  tests/test_roundtripping.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -20,7 +18,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
 from ._test_tokenizer import TokenizerTestCase


+17 -29  tests/test_smart_list.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,14 +18,14 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
-from mwparserfromhell.compat import py3k, range
-from mwparserfromhell.smart_list import SmartList, _ListProxy
+from mwparserfromhell.smart_list import SmartList
+from mwparserfromhell.smart_list.list_proxy import ListProxy
 
 
 class TestSmartList(unittest.TestCase):
-    """Test cases for the SmartList class and its child, _ListProxy."""
+    """Test cases for the SmartList class and its child, ListProxy."""
 
     def _test_get_set_del_item(self, builder):
         """Run tests on __get/set/delitem__ of a list built with *builder*."""
@@ -126,14 +124,9 @@ class TestSmartList(unittest.TestCase):
         list3 = builder([0, 2, 3, 4])
         list4 = builder([0, 1, 2])
 
-        if py3k:
-            self.assertEqual("[0, 1, 2, 3, 'one', 'two']", str(list1))
-            self.assertEqual(b"\x00\x01\x02", bytes(list4))
-            self.assertEqual("[0, 1, 2, 3, 'one', 'two']", repr(list1))
-        else:
-            self.assertEqual("[0, 1, 2, 3, u'one', u'two']", unicode(list1))
-            self.assertEqual(b"[0, 1, 2, 3, u'one', u'two']", str(list1))
-            self.assertEqual(b"[0, 1, 2, 3, u'one', u'two']", repr(list1))
+        self.assertEqual("[0, 1, 2, 3, 'one', 'two']", str(list1))
+        self.assertEqual(b"\x00\x01\x02", bytes(list4))
+        self.assertEqual("[0, 1, 2, 3, 'one', 'two']", repr(list1))
 
         self.assertLess(list1, list3)
         self.assertLessEqual(list1, list3)
@@ -184,7 +177,7 @@ class TestSmartList(unittest.TestCase):
 
         gen1 = iter(list1)
         out = []
-        for i in range(len(list1)):
+        for _ in range(len(list1)):
             out.append(next(gen1))
         self.assertRaises(StopIteration, next, gen1)
         self.assertEqual([0, 1, 2, 3, "one", "two"], out)
@@ -261,18 +254,13 @@ class TestSmartList(unittest.TestCase):
         self.assertEqual([0, 2, 2, 3, 4, 5], list1)
         list1.sort(reverse=True)
         self.assertEqual([5, 4, 3, 2, 2, 0], list1)
-        if not py3k:
-            func = lambda x, y: abs(3 - x) - abs(3 - y)  # Distance from 3
-            list1.sort(cmp=func)
-            self.assertEqual([3, 4, 2, 2, 5, 0], list1)
-            list1.sort(cmp=func, reverse=True)
-            self.assertEqual([0, 5, 4, 2, 2, 3], list1)
         list3.sort(key=lambda i: i[1])
         self.assertEqual([("d", 2), ("c", 3), ("a", 5), ("b", 8)], list3)
         list3.sort(key=lambda i: i[1], reverse=True)
         self.assertEqual([("b", 8), ("a", 5), ("c", 3), ("d", 2)], list3)
 
-    def _dispatch_test_for_children(self, meth):
+    @staticmethod
+    def _dispatch_test_for_children(meth):
         """Run a test method on various different types of children."""
         meth(lambda L: SmartList(list(L))[:])
         meth(lambda L: SmartList([999] + list(L))[1:])
@@ -280,13 +268,13 @@ class TestSmartList(unittest.TestCase):
         meth(lambda L: SmartList([101, 102] + list(L) + [201, 202])[2:-2])
 
     def test_docs(self):
-        """make sure the methods of SmartList/_ListProxy have docstrings"""
+        """make sure the methods of SmartList/ListProxy have docstrings"""
         methods = ["append", "count", "extend", "index", "insert", "pop",
                    "remove", "reverse", "sort"]
         for meth in methods:
             expected = getattr(list, meth).__doc__
             smartlist_doc = getattr(SmartList, meth).__doc__
-            listproxy_doc = getattr(_ListProxy, meth).__doc__
+            listproxy_doc = getattr(ListProxy, meth).__doc__
             self.assertEqual(expected, smartlist_doc)
             self.assertEqual(expected, listproxy_doc)
@@ -317,19 +305,19 @@ class TestSmartList(unittest.TestCase):
         self._test_list_methods(SmartList)
 
     def test_child_get_set_del(self):
-        """make sure _ListProxy's getitem/setitem/delitem work"""
+        """make sure ListProxy's getitem/setitem/delitem work"""
         self._dispatch_test_for_children(self._test_get_set_del_item)
 
     def test_child_add(self):
-        """make sure _ListProxy's add/radd/iadd work"""
+        """make sure ListProxy's add/radd/iadd work"""
         self._dispatch_test_for_children(self._test_add_radd_iadd)
 
     def test_child_other_magics(self):
-        """make sure _ListProxy's other magically implemented features work"""
+        """make sure ListProxy's other magically implemented features work"""
         self._dispatch_test_for_children(self._test_other_magic_methods)
 
     def test_child_methods(self):
-        """make sure _ListProxy's non-magic methods work, like append()"""
+        """make sure ListProxy's non-magic methods work, like append()"""
         self._dispatch_test_for_children(self._test_list_methods)
 
     def test_influence(self):


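smart_list.py has been split into a package, and the proxy class is imported from its own module under the non-underscore name ListProxy. A rough sketch of the relationship the tests exercise, with the write-back behaviour inferred from the SmartList tests above rather than from stated docs:

from mwparserfromhell.smart_list import SmartList
from mwparserfromhell.smart_list.list_proxy import ListProxy

parent = SmartList([0, 1, 2, 3])
child = parent[1:3]                  # slices of a SmartList are live views
assert isinstance(child, ListProxy)

child.append(99)                     # edits to the view write back to the parent
assert 99 in parent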
+42 -72  tests/test_string_mixin.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,19 +18,17 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 from sys import getdefaultencoding
 from types import GeneratorType
 import unittest
 
-from mwparserfromhell.compat import bytes, py3k, range, str
 from mwparserfromhell.string_mixin import StringMixIn
 
 
 class _FakeString(StringMixIn):
     def __init__(self, data):
         self._data = data
 
-    def __unicode__(self):
+    def __str__(self):
         return self._data
@@ -42,18 +38,16 @@ class TestStringMixIn(unittest.TestCase):
     def test_docs(self):
         """make sure the various methods of StringMixIn have docstrings"""
         methods = [
-            "capitalize", "center", "count", "encode", "endswith",
-            "expandtabs", "find", "format", "index", "isalnum", "isalpha",
-            "isdecimal", "isdigit", "islower", "isnumeric", "isspace",
-            "istitle", "isupper", "join", "ljust", "lower", "lstrip",
-            "partition", "replace", "rfind", "rindex", "rjust", "rpartition",
-            "rsplit", "rstrip", "split", "splitlines", "startswith", "strip",
-            "swapcase", "title", "translate", "upper", "zfill"]
-        if py3k:
-            methods.extend(["casefold", "format_map", "isidentifier", "isprintable",
-                            "maketrans"])
-        else:
-            methods.append("decode")
+            "capitalize", "casefold", "center", "count", "encode", "endswith",
+            "expandtabs", "find", "format", "format_map", "index", "isalnum",
+            "isalpha", "isdecimal", "isdigit", "isidentifier", "islower",
+            "isnumeric", "isprintable", "isspace", "istitle", "isupper",
+            "join", "ljust", "lower", "lstrip", "maketrans", "partition",
+            "replace", "rfind", "rindex", "rjust", "rpartition", "rsplit",
+            "rstrip", "split", "splitlines", "startswith", "strip", "swapcase",
+            "title", "translate", "upper", "zfill"
+        ]
 
         for meth in methods:
             expected = getattr("foo", meth).__doc__
             actual = getattr(_FakeString("foo"), meth).__doc__
@@ -64,17 +58,11 @@ class TestStringMixIn(unittest.TestCase):
         fstr = _FakeString("fake string")
         self.assertEqual(str(fstr), "fake string")
         self.assertEqual(bytes(fstr), b"fake string")
-        if py3k:
-            self.assertEqual(repr(fstr), "'fake string'")
-        else:
-            self.assertEqual(repr(fstr), b"u'fake string'")
+        self.assertEqual(repr(fstr), "'fake string'")
 
         self.assertIsInstance(str(fstr), str)
         self.assertIsInstance(bytes(fstr), bytes)
-        if py3k:
-            self.assertIsInstance(repr(fstr), str)
-        else:
-            self.assertIsInstance(repr(fstr), bytes)
+        self.assertIsInstance(repr(fstr), str)
 
     def test_comparisons(self):
         """make sure comparison operators work"""
@@ -139,7 +127,7 @@ class TestStringMixIn(unittest.TestCase):
         self.assertIsInstance(gen2, GeneratorType)
 
         out = []
-        for i in range(len(str1)):
+        for _ in range(len(str1)):
             out.append(next(gen1))
         self.assertRaises(StopIteration, next, gen1)
         self.assertEqual(expected, out)
@@ -179,14 +167,6 @@ class TestStringMixIn(unittest.TestCase):
         self.assertEqual(1, str1.count("r", 5, 9))
         self.assertEqual(0, str1.count("r", 5, 7))
 
-        if not py3k:
-            str2 = _FakeString("fo")
-            self.assertEqual(str1, str1.decode())
-            actual = _FakeString("\\U00010332\\U0001033f\\U00010344")
-            self.assertEqual("𐌲𐌿𐍄", actual.decode("unicode_escape"))
-            self.assertRaises(UnicodeError, str2.decode, "punycode")
-            self.assertEqual("", str2.decode("punycode", "ignore"))
-
         str3 = _FakeString("𐌲𐌿𐍄")
         actual = b"\xF0\x90\x8C\xB2\xF0\x90\x8C\xBF\xF0\x90\x8D\x84"
         self.assertEqual(b"fake string", str1.encode())
@@ -233,10 +213,9 @@ class TestStringMixIn(unittest.TestCase):
         self.assertEqual("foobarbazbuzz", str7.format("bar", abc="baz"))
         self.assertRaises(IndexError, str8.format, "abc")
 
-        if py3k:
-            self.assertEqual("fake string", str1.format_map({}))
-            self.assertEqual("foobarbaz", str6.format_map({"abc": "bar"}))
-            self.assertRaises(ValueError, str5.format_map, {0: "abc"})
+        self.assertEqual("fake string", str1.format_map({}))
+        self.assertEqual("foobarbaz", str6.format_map({"abc": "bar"}))
+        self.assertRaises(ValueError, str5.format_map, {0: "abc"})
 
         self.assertEqual(3, str1.index("e"))
         self.assertRaises(ValueError, str1.index, "z")
@@ -269,11 +248,10 @@ class TestStringMixIn(unittest.TestCase):
         self.assertFalse(str13.isdigit())
         self.assertTrue(str14.isdigit())
 
-        if py3k:
-            self.assertTrue(str9.isidentifier())
-            self.assertTrue(str10.isidentifier())
-            self.assertFalse(str11.isidentifier())
-            self.assertFalse(str12.isidentifier())
+        self.assertTrue(str9.isidentifier())
+        self.assertTrue(str10.isidentifier())
+        self.assertFalse(str11.isidentifier())
+        self.assertFalse(str12.isidentifier())
 
         str15 = _FakeString("")
         str16 = _FakeString("FooBar")
@@ -286,13 +264,12 @@ class TestStringMixIn(unittest.TestCase):
         self.assertTrue(str13.isnumeric())
         self.assertTrue(str14.isnumeric())
 
-        if py3k:
-            str16B = _FakeString("\x01\x02")
-            self.assertTrue(str9.isprintable())
-            self.assertTrue(str13.isprintable())
-            self.assertTrue(str14.isprintable())
-            self.assertTrue(str15.isprintable())
-            self.assertFalse(str16B.isprintable())
+        str16B = _FakeString("\x01\x02")
+        self.assertTrue(str9.isprintable())
+        self.assertTrue(str13.isprintable())
+        self.assertTrue(str14.isprintable())
+        self.assertTrue(str15.isprintable())
+        self.assertFalse(str16B.isprintable())
 
         str17 = _FakeString(" ")
         str18 = _FakeString("\t \t \r\n")
@@ -323,10 +300,9 @@ class TestStringMixIn(unittest.TestCase):
         self.assertEqual("", str15.lower())
         self.assertEqual("foobar", str16.lower())
         self.assertEqual("ß", str22.lower())
-        if py3k:
-            self.assertEqual("", str15.casefold())
-            self.assertEqual("foobar", str16.casefold())
-            self.assertEqual("ss", str22.casefold())
+        self.assertEqual("", str15.casefold())
+        self.assertEqual("foobar", str16.casefold())
+        self.assertEqual("ss", str22.casefold())
 
         str23 = _FakeString(" fake string ")
         self.assertEqual("fake string", str1.lstrip())
@@ -372,9 +348,8 @@ class TestStringMixIn(unittest.TestCase):
         self.assertEqual(actual, str25.rsplit(None, 3))
         actual = [" this is a sentence with", "", "whitespace", ""]
         self.assertEqual(actual, str25.rsplit(" ", 3))
-        if py3k:
-            actual = [" this is a", "sentence", "with", "whitespace"]
-            self.assertEqual(actual, str25.rsplit(maxsplit=3))
+        actual = [" this is a", "sentence", "with", "whitespace"]
+        self.assertEqual(actual, str25.rsplit(maxsplit=3))
 
         self.assertEqual("fake string", str1.rstrip())
         self.assertEqual(" fake string", str23.rstrip())
@@ -390,9 +365,8 @@ class TestStringMixIn(unittest.TestCase):
         self.assertEqual(actual, str25.split(None, 3))
         actual = ["", "", "", "this is a sentence with whitespace "]
         self.assertEqual(actual, str25.split(" ", 3))
-        if py3k:
-            actual = ["this", "is", "a", "sentence with whitespace "]
-            self.assertEqual(actual, str25.split(maxsplit=3))
+        actual = ["this", "is", "a", "sentence with whitespace "]
+        self.assertEqual(actual, str25.split(maxsplit=3))
 
         str26 = _FakeString("lines\nof\ntext\r\nare\r\npresented\nhere")
         self.assertEqual(["lines", "of", "text", "are", "presented", "here"],
@@ -411,17 +385,13 @@ class TestStringMixIn(unittest.TestCase):
 
         self.assertEqual("Fake String", str1.title())
 
-        if py3k:
-            table1 = StringMixIn.maketrans({97: "1", 101: "2", 105: "3",
-                                            111: "4", 117: "5"})
-            table2 = StringMixIn.maketrans("aeiou", "12345")
-            table3 = StringMixIn.maketrans("aeiou", "12345", "rts")
-            self.assertEqual("f1k2 str3ng", str1.translate(table1))
-            self.assertEqual("f1k2 str3ng", str1.translate(table2))
-            self.assertEqual("f1k2 3ng", str1.translate(table3))
-        else:
-            table = {97: "1", 101: "2", 105: "3", 111: "4", 117: "5"}
-            self.assertEqual("f1k2 str3ng", str1.translate(table))
+        table1 = StringMixIn.maketrans({97: "1", 101: "2", 105: "3",
+                                        111: "4", 117: "5"})
+        table2 = StringMixIn.maketrans("aeiou", "12345")
+        table3 = StringMixIn.maketrans("aeiou", "12345", "rts")
+        self.assertEqual("f1k2 str3ng", str1.translate(table1))
+        self.assertEqual("f1k2 str3ng", str1.translate(table2))
+        self.assertEqual("f1k2 3ng", str1.translate(table3))
 
         self.assertEqual("", str15.upper())
         self.assertEqual("FOOBAR", str16.upper())


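With the py3k guards gone, StringMixIn is expected to mirror the full Python 3 str surface unconditionally, including casefold, format_map, isidentifier, isprintable and maketrans. A short sketch of the same behaviour on plain str, matching the values asserted above:

# These methods exist on every Python 3 str, so no feature check is needed.
assert "ß".casefold() == "ss"
assert "foo{abc}".format_map({"abc": "barbaz"}) == "foobarbaz"
assert "fake".isidentifier() and not "fake string".isidentifier()

table = str.maketrans("aeiou", "12345")
assert "fake string".translate(table) == "f1k2 str3ng"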
+4 -8  tests/test_tag.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
-from mwparserfromhell.compat import str
 from mwparserfromhell.nodes import Tag, Template, Text
 from mwparserfromhell.nodes.extras import Attribute
 from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -37,8 +33,8 @@ agenpnv = lambda name, a, b, c: Attribute(wraptext(name), None, '"', a, b, c)
 class TestTag(TreeEqualityTestCase):
     """Test cases for the Tag node."""
 
-    def test_unicode(self):
-        """test Tag.__unicode__()"""
+    def test_str(self):
+        """test Tag.__str__()"""
         node1 = Tag(wraptext("ref"))
         node2 = Tag(wraptext("span"), wraptext("foo"),
                     [agen("style", "color: red;")])
@@ -230,7 +226,7 @@ class TestTag(TreeEqualityTestCase):
         node.wiki_markup = "{"
         self.assertEqual("{|\n{", node)
         node2 = Tag(wraptext("table"), wraptext("\n"), wiki_style_separator="|")
-        self.assertEqual("|", node.wiki_style_separator)
+        self.assertEqual("|", node2.wiki_style_separator)
 
     def test_closing_wiki_markup(self):
         """test getter/setter for closing_wiki_markup attribute"""


+3 -7  tests/test_template.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,11 +18,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 from difflib import unified_diff
 import unittest
 
-from mwparserfromhell.compat import str
 from mwparserfromhell.nodes import HTMLEntity, Template, Text
 from mwparserfromhell.nodes.extras import Parameter
 from mwparserfromhell import parse
@@ -37,8 +33,8 @@ pgenh = lambda k, v: Parameter(wraptext(k), wraptext(v), showkey=False)
 class TestTemplate(TreeEqualityTestCase):
     """Test cases for the Template node."""
 
-    def test_unicode(self):
-        """test Template.__unicode__()"""
+    def test_str(self):
+        """test Template.__str__()"""
         node = Template(wraptext("foobar"))
         self.assertEqual("{{foobar}}", str(node))
         node2 = Template(wraptext("foo"),


+3 -7  tests/test_text.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,17 +18,15 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
-from mwparserfromhell.compat import str
 from mwparserfromhell.nodes import Text
 
 
 class TestText(unittest.TestCase):
     """Test cases for the Text node."""
 
-    def test_unicode(self):
-        """test Text.__unicode__()"""
+    def test_str(self):
+        """test Text.__str__()"""
         node = Text("foobar")
         self.assertEqual("foobar", str(node))
         node2 = Text("fóóbar")
node2 = Text("fóóbar") node2 = Text("fóóbar")


+4 -16  tests/test_tokens.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
-from mwparserfromhell.compat import py3k
 from mwparserfromhell.parser import tokens
 
 
 class TestTokens(unittest.TestCase):
@@ -64,17 +60,9 @@ class TestTokens(unittest.TestCase):
         hundredchars = ("earwig" * 100)[:97] + "..."
 
         self.assertEqual("Token()", repr(token1))
-        if py3k:
-            token2repr1 = "Token(foo='bar', baz=123)"
-            token2repr2 = "Token(baz=123, foo='bar')"
-            token3repr = "Text(text='" + hundredchars + "')"
-        else:
-            token2repr1 = "Token(foo=u'bar', baz=123)"
-            token2repr2 = "Token(baz=123, foo=u'bar')"
-            token3repr = "Text(text=u'" + hundredchars + "')"
-        token2repr = repr(token2)
-        self.assertTrue(token2repr == token2repr1 or token2repr == token2repr2)
-        self.assertEqual(token3repr, repr(token3))
+        self.assertTrue(repr(token2) in (
+            "Token(foo='bar', baz=123)", "Token(baz=123, foo='bar')"))
+        self.assertEqual("Text(text='" + hundredchars + "')", repr(token3))
 
     def test_equality(self):
         """check that equivalent tokens are considered equal"""


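Dropping the u'' reprs leaves only the keyword-order ambiguity, which the rewritten assertion handles with a plain membership test. A tiny standalone illustration of that pattern (the describe() helper is made up for this sketch):

def describe(**kwargs):
    # Build a Token-style repr from keyword arguments.
    return "Token(" + ", ".join(f"{k}={v!r}" for k, v in kwargs.items()) + ")"

# Accept either argument ordering instead of assuming one.
assert describe(foo="bar", baz=123) in (
    "Token(foo='bar', baz=123)", "Token(baz=123, foo='bar')")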
+0 -3  tests/test_utils.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -20,7 +18,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
 from mwparserfromhell.nodes import Template, Text


+4 -9  tests/test_wikicode.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,15 +18,12 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 from functools import partial
 import re
 from types import GeneratorType
 import unittest
 
-from mwparserfromhell.compat import py3k, str
-from mwparserfromhell.nodes import (Argument, Comment, Heading, HTMLEntity,
-                                    Node, Tag, Template, Text, Wikilink)
+from mwparserfromhell.nodes import Argument, Heading, Template, Text
 from mwparserfromhell.smart_list import SmartList
 from mwparserfromhell.wikicode import Wikicode
 from mwparserfromhell import parse
@@ -38,8 +33,8 @@ from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
 class TestWikicode(TreeEqualityTestCase):
     """Tests for the Wikicode class, which manages a list of nodes."""
 
-    def test_unicode(self):
-        """test Wikicode.__unicode__()"""
+    def test_str(self):
+        """test Wikicode.__str__()"""
         code1 = parse("foobar")
         code2 = parse("Have a {{template}} and a [[page|link]]")
         self.assertEqual("foobar", str(code1))


+3 -7  tests/test_wikilink.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +18,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from __future__ import unicode_literals
 import unittest
 
-from mwparserfromhell.compat import str
 from mwparserfromhell.nodes import Text, Wikilink
 
 from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
@@ -31,8 +27,8 @@ from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext
 class TestWikilink(TreeEqualityTestCase):
     """Test cases for the Wikilink node."""
 
-    def test_unicode(self):
-        """test Wikilink.__unicode__()"""
+    def test_str(self):
+        """test Wikilink.__str__()"""
         node = Wikilink(wraptext("foobar"))
         self.assertEqual("[[foobar]]", str(node))
         node2 = Wikilink(wraptext("foo"), wraptext("bar"))

