diff --git a/.coveragerc b/.coveragerc index 48a64ce..909a0e2 100644 --- a/.coveragerc +++ b/.coveragerc @@ -6,4 +6,3 @@ partial_branches = pragma: no branch if py3k: if not py3k: - if py26: diff --git a/.gitignore b/.gitignore index 3da2db3..cf67fa0 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ dist docs/_build scripts/*.log htmlcov/ +.idea/ diff --git a/.travis.yml b/.travis.yml index c0233d9..bee8152 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,20 +1,16 @@ +dist: xenial language: python python: - - 2.6 - - 2.7 - - 3.2 - - 3.3 - 3.4 - 3.5 - 3.6 - - nightly -sudo: false + - 3.7 + - 3.8 install: - - if [[ $TRAVIS_PYTHON_VERSION == '3.2' ]]; then pip install coverage==3.7.1; fi - pip install coveralls - - python setup.py build + - python setup.py develop script: - - coverage run --source=mwparserfromhell setup.py -q test + - coverage run --source=mwparserfromhell -m unittest discover after_success: - coveralls env: diff --git a/CHANGELOG b/CHANGELOG index 79a712b..53b3548 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,30 @@ v0.6 (unreleased): -- ... +- Added support for Python 3.8. +- Updated Wikicode.matches() to recognize underscores as being equivalent + to spaces. (#216) +- Fixed a rare parsing bug involving deeply nested style tags. (#224) + +v0.5.4 (released May 15, 2019): + +- Fixed an unlikely crash in the C tokenizer when interrupted while parsing + a heading. + +v0.5.3 (released March 30, 2019): + +- Fixed manual construction of Node objects, previously unsupported. (#214) +- Fixed Wikicode transformation methods (replace(), remove(), etc.) when passed + an empty section as an argument. (#212) +- Fixed the parser getting stuck inside malformed tables. (#206) + +v0.5.2 (released November 1, 2018): + +- Dropped support for end-of-life Python versions 2.6, 3.2, 3.3. (#199, #204) +- Fixed signals getting stuck inside the C tokenizer until parsing finishes, + in pathological cases. (#206) +- Fixed not being considered a single-only tag. (#200) +- Fixed a C tokenizer crash on Python 3.7 when compiled with assertions. (#208) +- Cleaned up some minor documentation issues. (#207) v0.5.1 (released March 3, 2018): diff --git a/LICENSE b/LICENSE index f353cd7..c846a0e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (C) 2012-2018 Ben Kurtovic +Copyright (C) 2012-2019 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.rst b/README.rst index 833ecea..98af7a4 100644 --- a/README.rst +++ b/README.rst @@ -3,7 +3,7 @@ mwparserfromhell .. image:: https://img.shields.io/travis/earwig/mwparserfromhell/develop.svg :alt: Build Status - :target: http://travis-ci.org/earwig/mwparserfromhell + :target: https://travis-ci.org/earwig/mwparserfromhell .. image:: https://img.shields.io/coveralls/earwig/mwparserfromhell/develop.svg :alt: Coverage Status @@ -11,7 +11,7 @@ mwparserfromhell **mwparserfromhell** (the *MediaWiki Parser from Hell*) is a Python package that provides an easy-to-use and outrageously powerful parser for MediaWiki_ -wikicode. It supports Python 2 and Python 3. +wikicode. It supports Python 3.4+. Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others. Full documentation is available on ReadTheDocs_. Development occurs on GitHub_. @@ -30,88 +30,86 @@ Alternatively, get the latest development version:: python setup.py install You can run the comprehensive unit testing suite with -``python setup.py test -q``. +``python -m unittest discover``. Usage ----- -Normal usage is rather straightforward (where ``text`` is page text):: +Normal usage is rather straightforward (where ``text`` is page text): - >>> import mwparserfromhell - >>> wikicode = mwparserfromhell.parse(text) +>>> import mwparserfromhell +>>> wikicode = mwparserfromhell.parse(text) ``wikicode`` is a ``mwparserfromhell.Wikicode`` object, which acts like an -ordinary ``str`` object (or ``unicode`` in Python 2) with some extra methods. -For example:: - - >>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?" - >>> wikicode = mwparserfromhell.parse(text) - >>> print(wikicode) - I has a template! {{foo|bar|baz|eggs=spam}} See it? - >>> templates = wikicode.filter_templates() - >>> print(templates) - ['{{foo|bar|baz|eggs=spam}}'] - >>> template = templates[0] - >>> print(template.name) - foo - >>> print(template.params) - ['bar', 'baz', 'eggs=spam'] - >>> print(template.get(1).value) - bar - >>> print(template.get("eggs").value) - spam - -Since nodes can contain other nodes, getting nested templates is trivial:: - - >>> text = "{{foo|{{bar}}={{baz|{{spam}}}}}}" - >>> mwparserfromhell.parse(text).filter_templates() - ['{{foo|{{bar}}={{baz|{{spam}}}}}}', '{{bar}}', '{{baz|{{spam}}}}', '{{spam}}'] +ordinary ``str`` object with some extra methods. +For example: + +>>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?" +>>> wikicode = mwparserfromhell.parse(text) +>>> print(wikicode) +I has a template! {{foo|bar|baz|eggs=spam}} See it? +>>> templates = wikicode.filter_templates() +>>> print(templates) +['{{foo|bar|baz|eggs=spam}}'] +>>> template = templates[0] +>>> print(template.name) +foo +>>> print(template.params) +['bar', 'baz', 'eggs=spam'] +>>> print(template.get(1).value) +bar +>>> print(template.get("eggs").value) +spam + +Since nodes can contain other nodes, getting nested templates is trivial: + +>>> text = "{{foo|{{bar}}={{baz|{{spam}}}}}}" +>>> mwparserfromhell.parse(text).filter_templates() +['{{foo|{{bar}}={{baz|{{spam}}}}}}', '{{bar}}', '{{baz|{{spam}}}}', '{{spam}}'] You can also pass ``recursive=False`` to ``filter_templates()`` and explore templates manually. This is possible because nodes can contain additional -``Wikicode`` objects:: - - >>> code = mwparserfromhell.parse("{{foo|this {{includes a|template}}}}") - >>> print(code.filter_templates(recursive=False)) - ['{{foo|this {{includes a|template}}}}'] - >>> foo = code.filter_templates(recursive=False)[0] - >>> print(foo.get(1).value) - this {{includes a|template}} - >>> print(foo.get(1).value.filter_templates()[0]) - {{includes a|template}} - >>> print(foo.get(1).value.filter_templates()[0].get(1).value) - template +``Wikicode`` objects: + +>>> code = mwparserfromhell.parse("{{foo|this {{includes a|template}}}}") +>>> print(code.filter_templates(recursive=False)) +['{{foo|this {{includes a|template}}}}'] +>>> foo = code.filter_templates(recursive=False)[0] +>>> print(foo.get(1).value) +this {{includes a|template}} +>>> print(foo.get(1).value.filter_templates()[0]) +{{includes a|template}} +>>> print(foo.get(1).value.filter_templates()[0].get(1).value) +template Templates can be easily modified to add, remove, or alter params. ``Wikicode`` objects can be treated like lists, with ``append()``, ``insert()``, ``remove()``, ``replace()``, and more. They also have a ``matches()`` method for comparing page or template names, which takes care of capitalization and -whitespace:: - - >>> text = "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}" - >>> code = mwparserfromhell.parse(text) - >>> for template in code.filter_templates(): - ... if template.name.matches("Cleanup") and not template.has("date"): - ... template.add("date", "July 2012") - ... - >>> print(code) - {{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{uncategorized}} - >>> code.replace("{{uncategorized}}", "{{bar-stub}}") - >>> print(code) - {{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}} - >>> print(code.filter_templates()) - ['{{cleanup|date=July 2012}}', '{{bar-stub}}'] +whitespace: + +>>> text = "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}" +>>> code = mwparserfromhell.parse(text) +>>> for template in code.filter_templates(): +... if template.name.matches("Cleanup") and not template.has("date"): +... template.add("date", "July 2012") +... +>>> print(code) +{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{uncategorized}} +>>> code.replace("{{uncategorized}}", "{{bar-stub}}") +>>> print(code) +{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}} +>>> print(code.filter_templates()) +['{{cleanup|date=July 2012}}', '{{bar-stub}}'] You can then convert ``code`` back into a regular ``str`` object (for -saving the page!) by calling ``str()`` on it:: +saving the page!) by calling ``str()`` on it: - >>> text = str(code) - >>> print(text) - {{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}} - >>> text == code - True - -Likewise, use ``unicode(code)`` in Python 2. +>>> text = str(code) +>>> print(text) +{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}} +>>> text == code +True Limitations ----------- @@ -164,7 +162,9 @@ Integration ``Page`` objects have a ``parse`` method that essentially calls ``mwparserfromhell.parse()`` on ``page.get()``. -If you're using Pywikibot_, your code might look like this:: +If you're using Pywikibot_, your code might look like this: + +.. code-block:: python import mwparserfromhell import pywikibot @@ -175,32 +175,44 @@ If you're using Pywikibot_, your code might look like this:: text = page.get() return mwparserfromhell.parse(text) -If you're not using a library, you can parse any page using the following -Python 3 code (via the API_):: +If you're not using a library, you can parse any page with the following +Python 3 code (using the API_ and the requests_ library): + +.. code-block:: python - import json - from urllib.parse import urlencode - from urllib.request import urlopen import mwparserfromhell + import requests + API_URL = "https://en.wikipedia.org/w/api.php" def parse(title): - data = {"action": "query", "prop": "revisions", "rvlimit": 1, - "rvprop": "content", "format": "json", "titles": title} - raw = urlopen(API_URL, urlencode(data).encode()).read() - res = json.loads(raw) - text = res["query"]["pages"].values()[0]["revisions"][0]["*"] + params = { + "action": "query", + "prop": "revisions", + "rvprop": "content", + "rvslots": "main", + "rvlimit": 1, + "titles": title, + "format": "json", + "formatversion": "2", + } + headers = {"User-Agent": "My-Bot-Name/1.0"} + req = requests.get(API_URL, headers=headers, params=params) + res = req.json() + revision = res["query"]["pages"][0]["revisions"][0] + text = revision["slots"]["main"]["content"] return mwparserfromhell.parse(text) -.. _MediaWiki: http://mediawiki.org -.. _ReadTheDocs: http://mwparserfromhell.readthedocs.io -.. _Earwig: http://en.wikipedia.org/wiki/User:The_Earwig -.. _Σ: http://en.wikipedia.org/wiki/User:%CE%A3 -.. _Legoktm: http://en.wikipedia.org/wiki/User:Legoktm +.. _MediaWiki: https://www.mediawiki.org +.. _ReadTheDocs: https://mwparserfromhell.readthedocs.io +.. _Earwig: https://en.wikipedia.org/wiki/User:The_Earwig +.. _Σ: https://en.wikipedia.org/wiki/User:%CE%A3 +.. _Legoktm: https://en.wikipedia.org/wiki/User:Legoktm .. _GitHub: https://github.com/earwig/mwparserfromhell -.. _Python Package Index: http://pypi.python.org -.. _get pip: http://pypi.python.org/pypi/pip +.. _Python Package Index: https://pypi.org/ +.. _get pip: https://pypi.org/project/pip/ .. _Word-ending links: https://www.mediawiki.org/wiki/Help:Links#linktrail .. _EarwigBot: https://github.com/earwig/earwigbot .. _Pywikibot: https://www.mediawiki.org/wiki/Manual:Pywikibot -.. _API: http://mediawiki.org/wiki/API +.. _API: https://www.mediawiki.org/wiki/API:Main_page +.. _requests: https://2.python-requests.org diff --git a/appveyor.yml b/appveyor.yml index 70b71b4..2a4de47 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -13,29 +13,15 @@ environment: global: # See: http://stackoverflow.com/a/13751649/163740 WRAPPER: "cmd /E:ON /V:ON /C .\\scripts\\win_wrapper.cmd" - PIP: "%WRAPPER% %PYTHON%\\Scripts\\pip.exe" - SETUPPY: "%WRAPPER% %PYTHON%\\python setup.py --with-extension" + PYEXE: "%WRAPPER% %PYTHON%\\python.exe" + SETUPPY: "%PYEXE% setup.py --with-extension" + PIP: "%PYEXE% -m pip" + TWINE: "%PYEXE% -m twine" PYPI_USERNAME: "earwigbot" PYPI_PASSWORD: secure: gOIcvPxSC2ujuhwOzwj3v8xjq3CCYd8keFWVnguLM+gcL0e02qshDHy7gwZZwj0+ matrix: - - PYTHON: "C:\\Python27" - PYTHON_VERSION: "2.7" - PYTHON_ARCH: "32" - - - PYTHON: "C:\\Python27-x64" - PYTHON_VERSION: "2.7" - PYTHON_ARCH: "64" - - - PYTHON: "C:\\Python33" - PYTHON_VERSION: "3.3" - PYTHON_ARCH: "32" - - - PYTHON: "C:\\Python33-x64" - PYTHON_VERSION: "3.3" - PYTHON_ARCH: "64" - - PYTHON: "C:\\Python34" PYTHON_VERSION: "3.4" PYTHON_ARCH: "32" @@ -60,21 +46,38 @@ environment: PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" + - PYTHON: "C:\\Python37" + PYTHON_VERSION: "3.7" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python37-x64" + PYTHON_VERSION: "3.7" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python38" + PYTHON_VERSION: "3.8" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python38-x64" + PYTHON_VERSION: "3.8" + PYTHON_ARCH: "64" + install: - "%PIP% install --disable-pip-version-check --user --upgrade pip" - "%PIP% install wheel twine" build_script: - "%SETUPPY% build" + - "%SETUPPY% develop --user" test_script: - - "%SETUPPY% -q test" + - "%PYEXE% -m unittest discover" after_test: - "%SETUPPY% bdist_wheel" on_success: - - "IF %APPVEYOR_REPO_BRANCH%==master %WRAPPER% %PYTHON%\\python -m twine upload dist\\* -u %PYPI_USERNAME% -p %PYPI_PASSWORD%" + - "IF %APPVEYOR_REPO_BRANCH%==master %TWINE% upload dist\\* -u %PYPI_USERNAME% -p %PYPI_PASSWORD%" artifacts: - path: dist\* diff --git a/docs/changelog.rst b/docs/changelog.rst index ddfdde4..1ca7411 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -5,9 +5,55 @@ v0.6 ---- Unreleased -(`changes `__): +(`changes `__): -- ... +- Added support for Python 3.8. +- Updated Wikicode.matches() to recognize underscores as being equivalent + to spaces. (`#216 `_) +- Fixed a rare parsing bug involving deeply nested style tags. + (`#224 `_) + +v0.5.4 +------ + +`Released May 15, 2019 `_ +(`changes `__): + +- Fixed an unlikely crash in the C tokenizer when interrupted while parsing + a heading. + +v0.5.3 +------ + +`Released March 30, 2019 `_ +(`changes `__): + +- Fixed manual construction of Node objects, previously unsupported. + (`#214 `_) +- Fixed :class:`.Wikicode` transformation methods (:meth:`.Wikicode.replace`, + :meth:`.Wikicode.remove`, etc.) when passed an empty section as an argument. + (`#212 `_) +- Fixed the parser getting stuck inside malformed tables. + (`#206 `_) + +v0.5.2 +------ + +`Released November 1, 2018 `_ +(`changes `__): + +- Dropped support for end-of-life Python versions 2.6, 3.2, 3.3. + (`#199 `_, + `#204 `_) +- Fixed signals getting stuck inside the C tokenizer until parsing finishes, + in pathological cases. + (`#206 `_) +- Fixed `` not being considered a single-only tag. + (`#200 `_) +- Fixed a C tokenizer crash on Python 3.7 when compiled with assertions. + (`#208 `_) +- Cleaned up some minor documentation issues. + (`#207 `_) v0.5.1 ------ diff --git a/docs/conf.py b/docs/conf.py index 5ac9c70..9666cd0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -42,7 +42,7 @@ master_doc = 'index' # General information about the project. project = u'mwparserfromhell' -copyright = u'2012–2018 Ben Kurtovic' +copyright = u'2012–2019 Ben Kurtovic' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/docs/index.rst b/docs/index.rst index 06dc2f9..1ca69f6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,15 +3,15 @@ MWParserFromHell v\ |version| Documentation :mod:`mwparserfromhell` (the *MediaWiki Parser from Hell*) is a Python package that provides an easy-to-use and outrageously powerful parser for MediaWiki_ -wikicode. It supports Python 2 and Python 3. +wikicode. It supports Python 3.4+. Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others. Development occurs on GitHub_. -.. _MediaWiki: http://mediawiki.org -.. _Earwig: http://en.wikipedia.org/wiki/User:The_Earwig -.. _Σ: http://en.wikipedia.org/wiki/User:%CE%A3 -.. _Legoktm: http://en.wikipedia.org/wiki/User:Legoktm +.. _MediaWiki: https://www.mediawiki.org +.. _Earwig: https://en.wikipedia.org/wiki/User:The_Earwig +.. _Σ: https://en.wikipedia.org/wiki/User:%CE%A3 +.. _Legoktm: https://en.wikipedia.org/wiki/User:Legoktm .. _GitHub: https://github.com/earwig/mwparserfromhell Installation @@ -28,10 +28,10 @@ Alternatively, get the latest development version:: python setup.py install You can run the comprehensive unit testing suite with -``python setup.py test -q``. +``python -m unittest discover``. -.. _Python Package Index: http://pypi.python.org -.. _get pip: http://pypi.python.org/pypi/pip +.. _Python Package Index: https://pypi.org/ +.. _get pip: https://pypi.org/project/pip/ Contents -------- diff --git a/docs/integration.rst b/docs/integration.rst index af3abc9..8054d9f 100644 --- a/docs/integration.rst +++ b/docs/integration.rst @@ -7,7 +7,7 @@ Integration :func:`mwparserfromhell.parse() ` on :meth:`~earwigbot.wiki.page.Page.get`. -If you're using Pywikibot_, your code might look like this:: +If you're using Pywikibot_, your code might look like this: import mwparserfromhell import pywikibot @@ -18,23 +18,33 @@ If you're using Pywikibot_, your code might look like this:: text = page.get() return mwparserfromhell.parse(text) -If you're not using a library, you can parse any page using the following code -(via the API_):: +If you're not using a library, you can parse any page with the following +Python 3 code (using the API_ and the requests_ library): - import json - from urllib.parse import urlencode - from urllib.request import urlopen import mwparserfromhell + import requests + API_URL = "https://en.wikipedia.org/w/api.php" def parse(title): - data = {"action": "query", "prop": "revisions", "rvlimit": 1, - "rvprop": "content", "format": "json", "titles": title} - raw = urlopen(API_URL, urlencode(data).encode()).read() - res = json.loads(raw) - text = res["query"]["pages"].values()[0]["revisions"][0]["*"] + params = { + "action": "query", + "prop": "revisions", + "rvprop": "content", + "rvslots": "main", + "rvlimit": 1, + "titles": title, + "format": "json", + "formatversion": "2", + } + headers = {"User-Agent": "My-Bot-Name/1.0"} + req = requests.get(API_URL, headers=headers, params=params) + res = req.json() + revision = res["query"]["pages"][0]["revisions"][0] + text = revision["slots"]["main"]["content"] return mwparserfromhell.parse(text) .. _EarwigBot: https://github.com/earwig/earwigbot .. _Pywikibot: https://www.mediawiki.org/wiki/Manual:Pywikibot -.. _API: http://mediawiki.org/wiki/API +.. _API: https://www.mediawiki.org/wiki/API:Main_page +.. _requests: https://2.python-requests.org diff --git a/docs/usage.rst b/docs/usage.rst index ee667fd..2cdc690 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -7,8 +7,7 @@ Normal usage is rather straightforward (where ``text`` is page text):: >>> wikicode = mwparserfromhell.parse(text) ``wikicode`` is a :class:`mwparserfromhell.Wikicode <.Wikicode>` object, which -acts like an ordinary ``str`` object (or ``unicode`` in Python 2) with some -extra methods. For example:: +acts like an ordinary ``str`` object with some extra methods. For example:: >>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?" >>> wikicode = mwparserfromhell.parse(text) @@ -78,7 +77,6 @@ saving the page!) by calling :func:`str` on it:: >>> text == code True -(Likewise, use :func:`unicode(code) ` in Python 2.) For more tips, check out :class:`Wikicode's full method list <.Wikicode>` and the :mod:`list of Nodes <.nodes>`. diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 11e1094..6056b83 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2018 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -23,16 +22,16 @@ """ `mwparserfromhell `_ (the MediaWiki Parser from Hell) is a Python package that provides an easy-to-use and -outrageously powerful parser for `MediaWiki `_ wikicode. +outrageously powerful parser for `MediaWiki `_ wikicode. """ __author__ = "Ben Kurtovic" -__copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic" +__copyright__ = "Copyright (C) 2012-2019 Ben Kurtovic" __license__ = "MIT License" __version__ = "0.6.dev0" __email__ = "ben.kurtovic@gmail.com" -from . import (compat, definitions, nodes, parser, smart_list, string_mixin, +from . import (definitions, nodes, parser, smart_list, string_mixin, utils, wikicode) parse = utils.parse_anything diff --git a/mwparserfromhell/compat.py b/mwparserfromhell/compat.py deleted file mode 100644 index 7a83cd1..0000000 --- a/mwparserfromhell/compat.py +++ /dev/null @@ -1,29 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Implements support for both Python 2 and Python 3 by defining common types in -terms of their Python 2/3 variants. For example, :class:`str` is set to -:class:`unicode` on Python 2 but :class:`str` on Python 3; likewise, -:class:`bytes` is :class:`str` on 2 but :class:`bytes` on 3. These types are -meant to be imported directly from within the parser's modules. -""" - -import sys - -py26 = (sys.version_info[0] == 2) and (sys.version_info[1] == 6) -py3k = (sys.version_info[0] == 3) -py32 = py3k and (sys.version_info[1] == 2) - -if py3k: - bytes = bytes - str = str - range = range - import html.entities as htmlentities - -else: - bytes = str - str = unicode - range = xrange - import htmlentitydefs as htmlentities - -del sys diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index 18a06cc..6191dc6 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -28,7 +27,6 @@ When updating this file, please also update the the C tokenizer version: - mwparserfromhell/parser/ctokenizer/definitions.h """ -from __future__ import unicode_literals __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", "is_single_only", "is_scheme"] @@ -56,8 +54,8 @@ INVISIBLE_TAGS = [ "section", "templatedata", "timeline" ] -# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 -SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] +# [mediawiki/core.git]/includes/Sanitizer.php @ 065bec63ea +SINGLE_ONLY = ["br", "hr", "meta", "link", "img", "wbr"] SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"] MARKUP_TO_HTML = { diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index 17ad3c3..6aa6ea4 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -29,13 +28,11 @@ the name of a :class:`.Template` is a :class:`.Wikicode` object that can contain text or more templates. """ -from __future__ import unicode_literals -from ..compat import str from ..string_mixin import StringMixIn -__all__ = ["Node", "Text", "Argument", "Heading", "HTMLEntity", "Tag", - "Template"] +__all__ = ["Argument", "Comment", "ExternalLink", "HTMLEntity", "Heading", + "Node", "Tag", "Template", "Text", "Wikilink"] class Node(StringMixIn): """Represents the base Node type, demonstrating the methods to override. @@ -56,6 +53,7 @@ class Node(StringMixIn): def __children__(self): return + # pylint: disable=unreachable yield # pragma: no cover (this is a generator that yields nothing) def __strip__(self, **kwargs): diff --git a/mwparserfromhell/nodes/argument.py b/mwparserfromhell/nodes/argument.py index 4259a35..4d9d613 100644 --- a/mwparserfromhell/nodes/argument.py +++ b/mwparserfromhell/nodes/argument.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,10 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from . import Node -from ..compat import str from ..utils import parse_anything __all__ = ["Argument"] @@ -32,9 +29,9 @@ class Argument(Node): """Represents a template argument substitution, like ``{{{foo}}}``.""" def __init__(self, name, default=None): - super(Argument, self).__init__() - self._name = name - self._default = default + super().__init__() + self.name = name + self.default = default def __unicode__(self): start = "{{{" + str(self.name) diff --git a/mwparserfromhell/nodes/comment.py b/mwparserfromhell/nodes/comment.py index 0d141e9..302699e 100644 --- a/mwparserfromhell/nodes/comment.py +++ b/mwparserfromhell/nodes/comment.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,10 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from . import Node -from ..compat import str __all__ = ["Comment"] @@ -31,11 +28,11 @@ class Comment(Node): """Represents a hidden HTML comment, like ````.""" def __init__(self, contents): - super(Comment, self).__init__() - self._contents = contents + super().__init__() + self.contents = contents def __unicode__(self): - return "" + return "" @property def contents(self): diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index f2659ab..4dc3594 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,10 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from . import Node -from ..compat import str from ..utils import parse_anything __all__ = ["ExternalLink"] @@ -32,10 +29,10 @@ class ExternalLink(Node): """Represents an external link, like ``[http://example.com/ Example]``.""" def __init__(self, url, title=None, brackets=True): - super(ExternalLink, self).__init__() - self._url = url - self._title = title - self._brackets = brackets + super().__init__() + self.url = url + self.title = title + self.brackets = brackets def __unicode__(self): if self.brackets: diff --git a/mwparserfromhell/nodes/extras/__init__.py b/mwparserfromhell/nodes/extras/__init__.py index 2d90b4e..43fe862 100644 --- a/mwparserfromhell/nodes/extras/__init__.py +++ b/mwparserfromhell/nodes/extras/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -21,8 +20,8 @@ # SOFTWARE. """ -This package contains objects used by :class:`.Node`\ s, but that are not nodes -themselves. This includes template parameters and HTML tag attributes. +This package contains objects used by :class:`.Node`\\ s, but that are not +nodes themselves. This includes template parameters and HTML tag attributes. """ from .attribute import Attribute diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index 0f55a6b..38d2423 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,9 +19,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals -from ...compat import str from ...string_mixin import StringMixIn from ...utils import parse_anything @@ -37,16 +34,15 @@ class Attribute(StringMixIn): """ def __init__(self, name, value=None, quotes='"', pad_first=" ", - pad_before_eq="", pad_after_eq="", check_quotes=True): - super(Attribute, self).__init__() - if check_quotes and not quotes and self._value_needs_quotes(value): - raise ValueError("given value {0!r} requires quotes".format(value)) - self._name = name - self._value = value - self._quotes = quotes - self._pad_first = pad_first - self._pad_before_eq = pad_before_eq - self._pad_after_eq = pad_after_eq + pad_before_eq="", pad_after_eq=""): + super().__init__() + self.name = name + self._quotes = None + self.value = value + self.quotes = quotes + self.pad_first = pad_first + self.pad_before_eq = pad_before_eq + self.pad_after_eq = pad_after_eq def __unicode__(self): result = self.pad_first + str(self.name) + self.pad_before_eq @@ -59,10 +55,17 @@ class Attribute(StringMixIn): @staticmethod def _value_needs_quotes(val): - """Return the preferred quotes for the given value, or None.""" - if val and any(char.isspace() for char in val): - return ('"' in val and "'" in val) or ("'" if '"' in val else '"') - return None + """Return valid quotes for the given value, or None if unneeded.""" + if not val: + return None + val = "".join(str(node) for node in val.filter_text(recursive=False)) + if not any(char.isspace() for char in val): + return None + if "'" in val and '"' not in val: + return '"' + if '"' in val and "'" not in val: + return "'" + return "\"'" # Either acceptable, " preferred over ' def _set_padding(self, attr, value): """Setter for the value of a padding attribute.""" @@ -79,7 +82,7 @@ class Attribute(StringMixIn): """Coerce a quote type into an acceptable value, or raise an error.""" orig, quotes = quotes, str(quotes) if quotes else None if quotes not in [None, '"', "'"]: - raise ValueError("{0!r} is not a valid quote type".format(orig)) + raise ValueError("{!r} is not a valid quote type".format(orig)) return quotes @property @@ -123,8 +126,8 @@ class Attribute(StringMixIn): else: code = parse_anything(newval) quotes = self._value_needs_quotes(code) - if quotes in ['"', "'"] or (quotes is True and not self.quotes): - self._quotes = quotes + if quotes and (not self.quotes or self.quotes not in quotes): + self._quotes = quotes[0] self._value = code @quotes.setter diff --git a/mwparserfromhell/nodes/extras/parameter.py b/mwparserfromhell/nodes/extras/parameter.py index 0d21d5b..4478084 100644 --- a/mwparserfromhell/nodes/extras/parameter.py +++ b/mwparserfromhell/nodes/extras/parameter.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,10 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals import re -from ...compat import str from ...string_mixin import StringMixIn from ...utils import parse_anything @@ -39,12 +36,10 @@ class Parameter(StringMixIn): """ def __init__(self, name, value, showkey=True): - super(Parameter, self).__init__() - if not showkey and not self.can_hide_key(name): - raise ValueError("key {0!r} cannot be hidden".format(name)) - self._name = name - self._value = value - self._showkey = showkey + super().__init__() + self.name = name + self.value = value + self.showkey = showkey def __unicode__(self): if self.showkey: @@ -83,5 +78,6 @@ class Parameter(StringMixIn): def showkey(self, newval): newval = bool(newval) if not newval and not self.can_hide_key(self.name): - raise ValueError("parameter key cannot be hidden") + raise ValueError("parameter key {!r} cannot be hidden".format( + self.name)) self._showkey = newval diff --git a/mwparserfromhell/nodes/heading.py b/mwparserfromhell/nodes/heading.py index 79f3364..1fe8790 100644 --- a/mwparserfromhell/nodes/heading.py +++ b/mwparserfromhell/nodes/heading.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,10 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from . import Node -from ..compat import str from ..utils import parse_anything __all__ = ["Heading"] @@ -32,9 +29,9 @@ class Heading(Node): """Represents a section heading in wikicode, like ``== Foo ==``.""" def __init__(self, title, level): - super(Heading, self).__init__() - self._title = title - self._level = level + super().__init__() + self.title = title + self.level = level def __unicode__(self): return ("=" * self.level) + str(self.title) + ("=" * self.level) diff --git a/mwparserfromhell/nodes/html_entity.py b/mwparserfromhell/nodes/html_entity.py index d5e9d73..8a2eef4 100644 --- a/mwparserfromhell/nodes/html_entity.py +++ b/mwparserfromhell/nodes/html_entity.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,10 +19,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import html.entities as htmlentities from . import Node -from ..compat import htmlentities, py3k, str __all__ = ["HTMLEntity"] @@ -31,7 +29,7 @@ class HTMLEntity(Node): """Represents an HTML entity, like `` ``, either named or unnamed.""" def __init__(self, value, named=None, hexadecimal=False, hex_char="x"): - super(HTMLEntity, self).__init__() + super().__init__() self._value = value if named is None: # Try to guess whether or not the entity is named try: @@ -53,42 +51,16 @@ class HTMLEntity(Node): def __unicode__(self): if self.named: - return "&{0};".format(self.value) + return "&{};".format(self.value) if self.hexadecimal: - return "&#{0}{1};".format(self.hex_char, self.value) - return "&#{0};".format(self.value) + return "&#{}{};".format(self.hex_char, self.value) + return "&#{};".format(self.value) def __strip__(self, **kwargs): if kwargs.get("normalize"): return self.normalize() return self - if not py3k: - @staticmethod - def _unichr(value): - """Implement builtin unichr() with support for non-BMP code points. - - On wide Python builds, this functions like the normal unichr(). On - narrow builds, this returns the value's encoded surrogate pair. - """ - try: - return unichr(value) - except ValueError: - # Test whether we're on the wide or narrow Python build. Check - # the length of a non-BMP code point - # (U+1F64A, SPEAK-NO-EVIL MONKEY): - if len("\U0001F64A") == 1: # pragma: no cover - raise - # Ensure this is within the range we can encode: - if value > 0x10FFFF: - raise ValueError("unichr() arg not in range(0x110000)") - code = value - 0x10000 - if value < 0: # Invalid code point - raise - lead = 0xD800 + (code >> 10) - trail = 0xDC00 + (code % (1 << 10)) - return unichr(lead) + unichr(trail) - @property def value(self): """The string value of the HTML entity.""" @@ -173,9 +145,8 @@ class HTMLEntity(Node): def normalize(self): """Return the unicode character represented by the HTML entity.""" - chrfunc = chr if py3k else HTMLEntity._unichr if self.named: - return chrfunc(htmlentities.name2codepoint[self.value]) + return chr(htmlentities.name2codepoint[self.value]) if self.hexadecimal: - return chrfunc(int(self.value, 16)) - return chrfunc(int(self.value)) + return chr(int(self.value, 16)) + return chr(int(self.value)) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index f0611a6..9fa45c5 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,11 +19,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from . import Node from .extras import Attribute -from ..compat import str from ..definitions import is_visible from ..utils import parse_anything @@ -37,29 +34,21 @@ class Tag(Node): self_closing=False, invalid=False, implicit=False, padding="", closing_tag=None, wiki_style_separator=None, closing_wiki_markup=None): - super(Tag, self).__init__() - self._tag = tag - if contents is None and not self_closing: - self._contents = parse_anything("") - else: - self._contents = contents + super().__init__() + self.tag = tag + self.contents = contents self._attrs = attrs if attrs else [] - self._wiki_markup = wiki_markup - self._self_closing = self_closing - self._invalid = invalid - self._implicit = implicit - self._padding = padding - if closing_tag: - self._closing_tag = closing_tag - else: - self._closing_tag = tag - self._wiki_style_separator = wiki_style_separator + self._closing_wiki_markup = None + self.wiki_markup = wiki_markup + self.self_closing = self_closing + self.invalid = invalid + self.implicit = implicit + self.padding = padding + if closing_tag is not None: + self.closing_tag = closing_tag + self.wiki_style_separator = wiki_style_separator if closing_wiki_markup is not None: - self._closing_wiki_markup = closing_wiki_markup - elif wiki_markup and not self_closing: - self._closing_wiki_markup = wiki_markup - else: - self._closing_wiki_markup = None + self.closing_wiki_markup = closing_wiki_markup def __unicode__(self): if self.wiki_markup: @@ -69,10 +58,10 @@ class Tag(Node): attrs = "" padding = self.padding or "" separator = self.wiki_style_separator or "" - close = self.closing_wiki_markup or "" if self.self_closing: return self.wiki_markup + attrs + padding + separator else: + close = self.closing_wiki_markup or "" return self.wiki_markup + attrs + padding + separator + \ str(self.contents) + close @@ -93,10 +82,10 @@ class Tag(Node): yield attr.name if attr.value is not None: yield attr.value - if self.contents: + if not self.self_closing: yield self.contents - if not self.self_closing and not self.wiki_markup and self.closing_tag: - yield self.closing_tag + if not self.wiki_markup and self.closing_tag: + yield self.closing_tag def __strip__(self, **kwargs): if self.contents and is_visible(self.tag): @@ -308,7 +297,10 @@ class Tag(Node): return attr def remove(self, name): - """Remove all attributes with the given *name*.""" + """Remove all attributes with the given *name*. + + Raises :exc:`ValueError` if none were found. + """ attrs = [attr for attr in self.attributes if attr.name == name.strip()] if not attrs: raise ValueError(name) diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 08d4ff0..4eae877 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2017 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,13 +19,11 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from collections import defaultdict import re from . import HTMLEntity, Node, Text from .extras import Parameter -from ..compat import range, str from ..utils import parse_anything __all__ = ["Template"] @@ -37,8 +34,8 @@ class Template(Node): """Represents a template in wikicode, like ``{{foo}}``.""" def __init__(self, name, params=None): - super(Template, self).__init__() - self._name = name + super().__init__() + self.name = name if params: self._params = params else: @@ -108,7 +105,7 @@ class Template(Node): def _blank_param_value(value): """Remove the content from *value* while keeping its whitespace. - Replace *value*\ 's nodes with two text nodes, the first containing + Replace *value*\\ 's nodes with two text nodes, the first containing whitespace from before its content and the second containing whitespace from after its content. """ diff --git a/mwparserfromhell/nodes/text.py b/mwparserfromhell/nodes/text.py index a49930f..b07eedc 100644 --- a/mwparserfromhell/nodes/text.py +++ b/mwparserfromhell/nodes/text.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,10 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from . import Node -from ..compat import str __all__ = ["Text"] @@ -31,8 +28,8 @@ class Text(Node): """Represents ordinary, unformatted text with no special properties.""" def __init__(self, value): - super(Text, self).__init__() - self._value = value + super().__init__() + self.value = value def __unicode__(self): return self.value diff --git a/mwparserfromhell/nodes/wikilink.py b/mwparserfromhell/nodes/wikilink.py index 8f4bf7d..98ae75f 100644 --- a/mwparserfromhell/nodes/wikilink.py +++ b/mwparserfromhell/nodes/wikilink.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,10 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from . import Node -from ..compat import str from ..utils import parse_anything __all__ = ["Wikilink"] @@ -32,9 +29,9 @@ class Wikilink(Node): """Represents an internal wikilink, like ``[[Foo|Bar]]``.""" def __init__(self, title, text=None): - super(Wikilink, self).__init__() - self._title = title - self._text = text + super().__init__() + self.title = title + self.text = text def __unicode__(self): if self.text is not None: diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index f39fdc4..fb1bf20 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -35,8 +34,8 @@ class ParserError(Exception): can happen. Its appearance indicates a bug. """ def __init__(self, extra): - msg = "This is a bug and should be reported. Info: {0}.".format(extra) - super(ParserError, self).__init__(msg) + msg = "This is a bug and should be reported. Info: {}.".format(extra) + super().__init__(msg) from .builder import Builder @@ -50,13 +49,13 @@ except ImportError: __all__ = ["use_c", "Parser", "ParserError"] -class Parser(object): +class Parser: """Represents a parser for wikicode. Actual parsing is a two-step process: first, the text is split up into a series of tokens by the :class:`.Tokenizer`, and then the tokens are - converted into trees of :class:`.Wikicode` objects and :class:`.Node`\ s by - the :class:`.Builder`. + converted into trees of :class:`.Wikicode` objects and :class:`.Node`\\ s + by the :class:`.Builder`. Instances of this class or its dependents (:class:`.Tokenizer` and :class:`.Builder`) should not be shared between threads. :meth:`parse` can @@ -79,7 +78,7 @@ class Parser(object): If given, *context* will be passed as a starting context to the parser. This is helpful when this function is used inside node attribute - setters. For example, :class:`.ExternalLink`\ 's + setters. For example, :class:`.ExternalLink`\\ 's :attr:`~.ExternalLink.url` setter sets *context* to :mod:`contexts.EXT_LINK_URI <.contexts>` to prevent the URL itself from becoming an :class:`.ExternalLink`. diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index c86a923..1ae2150 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,10 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from . import tokens, ParserError -from ..compat import str from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, Template, Text, Wikilink) from ..nodes.extras import Attribute, Parameter @@ -45,10 +42,10 @@ def _add_handler(token_type): return decorator -class Builder(object): +class Builder: """Builds a tree of nodes out of a sequence of tokens. - To use, pass a list of :class:`.Token`\ s to the :meth:`build` method. The + To use, pass a list of :class:`.Token`\\ s to the :meth:`build` method. The list will be exhausted as it is parsed and a :class:`.Wikicode` object containing the node tree will be returned. """ @@ -237,8 +234,7 @@ class Builder(object): else: name, value = self._pop(), None return Attribute(name, value, quotes, start.pad_first, - start.pad_before_eq, start.pad_after_eq, - check_quotes=False) + start.pad_before_eq, start.pad_after_eq) else: self._write(self._handle_token(token)) raise ParserError("_handle_attribute() missed a close token") diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index af6dea6..b6d013e 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2017 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -193,3 +192,16 @@ UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK + +def describe(context): + """Return a string describing the given context value, for debugging.""" + flags = [] + for name, value in globals().items(): + if not isinstance(value, int) or name.startswith("GL_"): + continue + if bin(value).count("1") != 1: + continue # Hacky way to skip aggregate contexts + if context & value: + flags.append((name, value)) + flags.sort(key=lambda it: it[1]) + return "|".join(it[0] for it in flags) diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h index f3d51f4..22a6b81 100644 --- a/mwparserfromhell/parser/ctokenizer/common.h +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -23,7 +23,7 @@ SOFTWARE. #pragma once #ifndef PY_SSIZE_T_CLEAN -#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html +#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html #endif #include @@ -34,10 +34,6 @@ SOFTWARE. /* Compatibility macros */ -#if PY_MAJOR_VERSION >= 3 -#define IS_PY3K -#endif - #ifndef uint64_t #define uint64_t unsigned PY_LONG_LONG #endif @@ -48,20 +44,8 @@ SOFTWARE. /* Unicode support macros */ -#if defined(IS_PY3K) && PY_MINOR_VERSION >= 3 -#define PEP_393 -#endif - -#ifdef PEP_393 -#define Unicode Py_UCS4 #define PyUnicode_FROM_SINGLE(chr) \ PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1) -#else -#define Unicode Py_UNICODE -#define PyUnicode_FROM_SINGLE(chr) \ - PyUnicode_FromUnicode(&(chr), 1) -#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE -#endif /* Error handling macros */ @@ -85,13 +69,9 @@ extern PyObject* definitions; typedef struct { Py_ssize_t capacity; Py_ssize_t length; -#ifdef PEP_393 PyObject* object; int kind; void* data; -#else - Py_UNICODE* data; -#endif } Textbuffer; typedef struct { @@ -111,12 +91,8 @@ typedef struct Stack Stack; typedef struct { PyObject* object; /* base PyUnicodeObject object */ Py_ssize_t length; /* length of object, in code points */ -#ifdef PEP_393 int kind; /* object's kind value */ void* data; /* object's raw unicode buffer */ -#else - Py_UNICODE* buf; /* object's internal buffer */ -#endif } TokenizerInput; typedef struct avl_tree_node avl_tree; diff --git a/mwparserfromhell/parser/ctokenizer/definitions.c b/mwparserfromhell/parser/ctokenizer/definitions.c index 38482a4..e247234 100644 --- a/mwparserfromhell/parser/ctokenizer/definitions.c +++ b/mwparserfromhell/parser/ctokenizer/definitions.c @@ -45,11 +45,12 @@ static const char* PARSER_BLACKLIST[] = { }; static const char* SINGLE[] = { - "br", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr", NULL + "br", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr", + "wbr", NULL }; static const char* SINGLE_ONLY[] = { - "br", "hr", "meta", "link", "img", NULL + "br", "hr", "meta", "link", "img", "wbr", NULL }; /* diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.h b/mwparserfromhell/parser/ctokenizer/tag_data.h index c2e9303..7e8edcb 100644 --- a/mwparserfromhell/parser/ctokenizer/tag_data.h +++ b/mwparserfromhell/parser/ctokenizer/tag_data.h @@ -32,7 +32,7 @@ typedef struct { Textbuffer* pad_first; Textbuffer* pad_before_eq; Textbuffer* pad_after_eq; - Unicode quoter; + Py_UCS4 quoter; Py_ssize_t reset; } TagData; diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.c b/mwparserfromhell/parser/ctokenizer/textbuffer.c index 3fd129f..e37b7c3 100644 --- a/mwparserfromhell/parser/ctokenizer/textbuffer.c +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.c @@ -29,23 +29,16 @@ SOFTWARE. /* Internal allocation function for textbuffers. */ -static int internal_alloc(Textbuffer* self, Unicode maxchar) +static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar) { self->capacity = INITIAL_CAPACITY; self->length = 0; -#ifdef PEP_393 self->object = PyUnicode_New(self->capacity, maxchar); if (!self->object) return -1; self->kind = PyUnicode_KIND(self->object); self->data = PyUnicode_DATA(self->object); -#else - (void) maxchar; // Unused - self->data = malloc(sizeof(Unicode) * self->capacity); - if (!self->data) - return -1; -#endif return 0; } @@ -55,11 +48,7 @@ static int internal_alloc(Textbuffer* self, Unicode maxchar) */ static void internal_dealloc(Textbuffer* self) { -#ifdef PEP_393 Py_DECREF(self->object); -#else - free(self->data); -#endif } /* @@ -67,7 +56,6 @@ static void internal_dealloc(Textbuffer* self) */ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap) { -#ifdef PEP_393 PyObject *newobj; void *newdata; @@ -79,10 +67,6 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap) Py_DECREF(self->object); self->object = newobj; self->data = newdata; -#else - if (!(self->data = realloc(self->data, sizeof(Unicode) * new_cap))) - return -1; -#endif self->capacity = new_cap; return 0; @@ -94,11 +78,9 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap) Textbuffer* Textbuffer_new(TokenizerInput* text) { Textbuffer* self = malloc(sizeof(Textbuffer)); - Unicode maxchar = 0; + Py_UCS4 maxchar = 0; -#ifdef PEP_393 maxchar = PyUnicode_MAX_CHAR_VALUE(text->object); -#endif if (!self) goto fail_nomem; @@ -127,11 +109,9 @@ void Textbuffer_dealloc(Textbuffer* self) */ int Textbuffer_reset(Textbuffer* self) { - Unicode maxchar = 0; + Py_UCS4 maxchar = 0; -#ifdef PEP_393 maxchar = PyUnicode_MAX_CHAR_VALUE(self->object); -#endif internal_dealloc(self); if (internal_alloc(self, maxchar)) @@ -142,18 +122,14 @@ int Textbuffer_reset(Textbuffer* self) /* Write a Unicode codepoint to the given textbuffer. */ -int Textbuffer_write(Textbuffer* self, Unicode code) +int Textbuffer_write(Textbuffer* self, Py_UCS4 code) { if (self->length >= self->capacity) { if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0) return -1; } -#ifdef PEP_393 PyUnicode_WRITE(self->kind, self->data, self->length++, code); -#else - self->data[self->length++] = code; -#endif return 0; } @@ -163,13 +139,9 @@ int Textbuffer_write(Textbuffer* self, Unicode code) This function does not check for bounds. */ -Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index) +Py_UCS4 Textbuffer_read(Textbuffer* self, Py_ssize_t index) { -#ifdef PEP_393 return PyUnicode_READ(self->kind, self->data, index); -#else - return self->data[index]; -#endif } /* @@ -177,11 +149,7 @@ Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index) */ PyObject* Textbuffer_render(Textbuffer* self) { -#ifdef PEP_393 return PyUnicode_FromKindAndData(self->kind, self->data, self->length); -#else - return PyUnicode_FromUnicode(self->data, self->length); -#endif } /* @@ -196,14 +164,9 @@ int Textbuffer_concat(Textbuffer* self, Textbuffer* other) return -1; } -#ifdef PEP_393 assert(self->kind == other->kind); memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data, other->length * other->kind); -#else - memcpy(self->data + self->length, other->data, - other->length * sizeof(Unicode)); -#endif self->length = newlen; return 0; @@ -215,18 +178,12 @@ int Textbuffer_concat(Textbuffer* self, Textbuffer* other) void Textbuffer_reverse(Textbuffer* self) { Py_ssize_t i, end = self->length - 1; - Unicode tmp; + Py_UCS4 tmp; for (i = 0; i < self->length / 2; i++) { -#ifdef PEP_393 tmp = PyUnicode_READ(self->kind, self->data, i); PyUnicode_WRITE(self->kind, self->data, i, PyUnicode_READ(self->kind, self->data, end - i)); PyUnicode_WRITE(self->kind, self->data, end - i, tmp); -#else - tmp = self->data[i]; - self->data[i] = self->data[end - i]; - self->data[end - i] = tmp; -#endif } } diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.h b/mwparserfromhell/parser/ctokenizer/textbuffer.h index 35579fd..85b39bc 100644 --- a/mwparserfromhell/parser/ctokenizer/textbuffer.h +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h @@ -29,8 +29,8 @@ SOFTWARE. Textbuffer* Textbuffer_new(TokenizerInput*); void Textbuffer_dealloc(Textbuffer*); int Textbuffer_reset(Textbuffer*); -int Textbuffer_write(Textbuffer*, Unicode); -Unicode Textbuffer_read(Textbuffer*, Py_ssize_t); +int Textbuffer_write(Textbuffer*, Py_UCS4); +Py_UCS4 Textbuffer_read(Textbuffer*, Py_ssize_t); PyObject* Textbuffer_render(Textbuffer*); int Textbuffer_concat(Textbuffer*, Textbuffer*); void Textbuffer_reverse(Textbuffer*); diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index 1998368..be7018b 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2018 Ben Kurtovic +Copyright (C) 2012-2019 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -52,7 +52,7 @@ static int Tokenizer_parse_tag(Tokenizer*); /* Determine whether the given code point is a marker. */ -static int is_marker(Unicode this) +static int is_marker(Py_UCS4 this) { int i; @@ -442,7 +442,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) static const char* valid = URISCHEME; Textbuffer* buffer; PyObject* scheme; - Unicode this; + Py_UCS4 this; int slashes, i; if (Tokenizer_check_route(self, LC_EXT_LINK_URI) < 0) @@ -463,7 +463,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) while (1) { if (!valid[i]) goto end_of_loop; - if (this == (Unicode) valid[i]) + if (this == (Py_UCS4) valid[i]) break; i++; } @@ -516,7 +516,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) static const char* valid = URISCHEME; Textbuffer *scheme_buffer = Textbuffer_new(&self->text); PyObject *scheme; - Unicode chunk; + Py_UCS4 chunk; Py_ssize_t i; int slashes, j; uint64_t new_context; @@ -536,7 +536,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) FAIL_ROUTE(0); return 0; } - } while (chunk != (Unicode) valid[j++]); + } while (chunk != (Py_UCS4) valid[j++]); Textbuffer_write(scheme_buffer, chunk); } end_of_loop: @@ -580,7 +580,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) Handle text in a free external link, including trailing punctuation. */ static int Tokenizer_handle_free_link_text( - Tokenizer* self, int* parens, Textbuffer* tail, Unicode this) + Tokenizer* self, int* parens, Textbuffer* tail, Py_UCS4 this) { #define PUSH_TAIL_BUFFER(tail, error) \ if (tail && tail->length > 0) { \ @@ -607,10 +607,10 @@ static int Tokenizer_handle_free_link_text( Return whether the current head is the end of a free link. */ static int -Tokenizer_is_free_link(Tokenizer* self, Unicode this, Unicode next) +Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) { // Built from Tokenizer_parse()'s end sentinels: - Unicode after = Tokenizer_read(self, 2); + Py_UCS4 after = Tokenizer_read(self, 2); uint64_t ctx = self->topstack->context; return (!this || this == '\n' || this == '[' || this == ']' || @@ -628,7 +628,7 @@ static PyObject* Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, Textbuffer* extra) { - Unicode this, next; + Py_UCS4 this, next; int parens = 0; if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) : @@ -813,11 +813,10 @@ static int Tokenizer_parse_heading(Tokenizer* self) self->global ^= GL_HEADING; return 0; } -#ifdef IS_PY3K + if (!heading) { + return -1; + } level = PyLong_FromSsize_t(heading->level); -#else - level = PyInt_FromSsize_t(heading->level); -#endif if (!level) { Py_DECREF(heading->title); free(heading); @@ -892,6 +891,9 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self) self->head = reset + best - 1; } else { + if (!after) { + return NULL; + } for (i = 0; i < best; i++) { if (Tokenizer_emit_char(self, '=')) { Py_DECREF(after->title); @@ -927,7 +929,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self) static int Tokenizer_really_parse_entity(Tokenizer* self) { PyObject *kwargs, *charobj, *textobj; - Unicode this; + Py_UCS4 this; int numeric, hexadecimal, i, j, zeroes, test; char *valid, *text, *buffer, *def; @@ -1008,7 +1010,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) while (1) { if (!valid[j]) FAIL_ROUTE_AND_EXIT() - if (this == (Unicode) valid[j]) + if (this == (Py_UCS4) valid[j]) break; j++; } @@ -1105,7 +1107,7 @@ static int Tokenizer_parse_comment(Tokenizer* self) { Py_ssize_t reset = self->head + 3; PyObject *comment; - Unicode this; + Py_UCS4 this; self->head += 4; if (Tokenizer_push(self, 0)) @@ -1205,7 +1207,7 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data) Handle whitespace inside of an HTML open tag. */ static int Tokenizer_handle_tag_space( - Tokenizer* self, TagData* data, Unicode text) + Tokenizer* self, TagData* data, Py_UCS4 text) { uint64_t ctx = data->context; uint64_t end_of_value = (ctx & TAG_ATTR_VALUE && @@ -1237,9 +1239,9 @@ static int Tokenizer_handle_tag_space( /* Handle regular text inside of an HTML open tag. */ -static int Tokenizer_handle_tag_text(Tokenizer* self, Unicode text) +static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UCS4 text) { - Unicode next = Tokenizer_read(self, 1); + Py_UCS4 next = Tokenizer_read(self, 1); if (!is_marker(text) || !Tokenizer_CAN_RECURSE(self)) return Tokenizer_emit_char(self, text); @@ -1256,7 +1258,7 @@ static int Tokenizer_handle_tag_text(Tokenizer* self, Unicode text) Handle all sorts of text data inside of an HTML open tag. */ static int Tokenizer_handle_tag_data( - Tokenizer* self, TagData* data, Unicode chunk) + Tokenizer* self, TagData* data, Py_UCS4 chunk) { PyObject *trash; int first_time, escaped; @@ -1438,7 +1440,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self) { Textbuffer* buffer; PyObject *buf_tmp, *end_tag, *start_tag; - Unicode this, next; + Py_UCS4 this, next; Py_ssize_t reset; int cmp; @@ -1594,7 +1596,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) { TagData *data = TagData_new(&self->text); PyObject *token, *text, *trash; - Unicode this, next; + Py_UCS4 this, next; int can_exit; if (!data) @@ -1680,7 +1682,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) Py_ssize_t reset = self->head + 1, pos = 0; Textbuffer* buf; PyObject *name, *tag; - Unicode this; + Py_UCS4 this; self->head += 2; buf = Textbuffer_new(&self->text); @@ -1801,6 +1803,11 @@ static int Tokenizer_parse_italics(Tokenizer* self) if (BAD_ROUTE_CONTEXT & LC_STYLE_PASS_AGAIN) { context = LC_STYLE_ITALICS | LC_STYLE_SECOND_PASS; stack = Tokenizer_parse(self, context, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + return Tokenizer_emit_text(self, "''"); + } } else return Tokenizer_emit_text(self, "''"); @@ -1977,7 +1984,7 @@ static PyObject* Tokenizer_parse_style(Tokenizer* self) static int Tokenizer_handle_list_marker(Tokenizer* self) { PyObject *kwargs, *markup; - Unicode code = Tokenizer_read(self, 0); + Py_UCS4 code = Tokenizer_read(self, 0); if (code == ';') self->topstack->context |= LC_DLTERM; @@ -2004,7 +2011,7 @@ static int Tokenizer_handle_list_marker(Tokenizer* self) */ static int Tokenizer_handle_list(Tokenizer* self) { - Unicode marker = Tokenizer_read(self, 1); + Py_UCS4 marker = Tokenizer_read(self, 1); if (Tokenizer_handle_list_marker(self)) return -1; @@ -2158,11 +2165,11 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, /* Handle style attributes for a table until an ending token. */ -static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token) +static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Py_UCS4 end_token) { TagData *data = TagData_new(&self->text); PyObject *padding, *trash; - Unicode this; + Py_UCS4 this; int can_exit; if (!data) @@ -2254,6 +2261,7 @@ static int Tokenizer_parse_table(Tokenizer* self) Py_DECREF(padding); Py_DECREF(style); while (!Tokenizer_IS_CURRENT_STACK(self, restore_point)) { + Tokenizer_memoize_bad_route(self); trash = Tokenizer_pop(self); Py_XDECREF(trash); } @@ -2471,7 +2479,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) everything is safe, or -1 if the route must be failed. */ static int -Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Unicode data) +Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UCS4 data) { if (context & LC_FAIL_NEXT) return -1; @@ -2556,7 +2564,7 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Unicode data) static int Tokenizer_has_leading_whitespace(Tokenizer* self) { int offset = 1; - Unicode current_character; + Py_UCS4 current_character; while (1) { current_character = Tokenizer_read_backwards(self, offset); if (!current_character || current_character == '\n') @@ -2574,7 +2582,7 @@ static int Tokenizer_has_leading_whitespace(Tokenizer* self) PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) { uint64_t this_context; - Unicode this, next, next_next, last; + Py_UCS4 this, next, next_next, last; PyObject* temp; if (push) { @@ -2603,6 +2611,8 @@ PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) } if (!this) return Tokenizer_handle_end(self, this_context); + if (PyErr_CheckSignals()) + return NULL; next = Tokenizer_read(self, 1); last = Tokenizer_read_backwards(self, 1); if (this == next && next == '{') { diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.h b/mwparserfromhell/parser/ctokenizer/tok_parse.h index 9d98b00..bdae573 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.h +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.h @@ -24,7 +24,7 @@ SOFTWARE. #include "common.h" -static const Unicode MARKERS[] = { +static const Py_UCS4 MARKERS[] = { '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', '-', '!', '\n', '\0'}; diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.c b/mwparserfromhell/parser/ctokenizer/tok_support.c index 30dc2a1..bf554f6 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.c +++ b/mwparserfromhell/parser/ctokenizer/tok_support.c @@ -275,7 +275,7 @@ int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token, /* Write a Unicode codepoint to the current textbuffer. */ -int Tokenizer_emit_char(Tokenizer* self, Unicode code) +int Tokenizer_emit_char(Tokenizer* self, Py_UCS4 code) { return Textbuffer_write(self->topstack->textbuffer, code); } @@ -389,19 +389,15 @@ int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text) /* Internal function to read the codepoint at the given index from the input. */ -static Unicode read_codepoint(TokenizerInput* text, Py_ssize_t index) +static Py_UCS4 read_codepoint(TokenizerInput* text, Py_ssize_t index) { -#ifdef PEP_393 return PyUnicode_READ(text->kind, text->data, index); -#else - return text->buf[index]; -#endif } /* Read the value at a relative point in the wikicode, forwards. */ -Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta) +Py_UCS4 Tokenizer_read(Tokenizer* self, Py_ssize_t delta) { Py_ssize_t index = self->head + delta; @@ -413,7 +409,7 @@ Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta) /* Read the value at a relative point in the wikicode, backwards. */ -Unicode Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) +Py_UCS4 Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) { Py_ssize_t index; diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.h b/mwparserfromhell/parser/ctokenizer/tok_support.h index f65d102..d08f5c4 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.h +++ b/mwparserfromhell/parser/ctokenizer/tok_support.h @@ -38,14 +38,14 @@ void Tokenizer_free_bad_route_tree(Tokenizer*); int Tokenizer_emit_token(Tokenizer*, PyObject*, int); int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int); -int Tokenizer_emit_char(Tokenizer*, Unicode); +int Tokenizer_emit_char(Tokenizer*, Py_UCS4); int Tokenizer_emit_text(Tokenizer*, const char*); int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*); int Tokenizer_emit_all(Tokenizer*, PyObject*); int Tokenizer_emit_text_then_stack(Tokenizer*, const char*); -Unicode Tokenizer_read(Tokenizer*, Py_ssize_t); -Unicode Tokenizer_read_backwards(Tokenizer*, Py_ssize_t); +Py_UCS4 Tokenizer_read(Tokenizer*, Py_ssize_t); +Py_UCS4 Tokenizer_read_backwards(Tokenizer*, Py_ssize_t); /* Macros */ diff --git a/mwparserfromhell/parser/ctokenizer/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tokenizer.c index 9017909..a501032 100644 --- a/mwparserfromhell/parser/ctokenizer/tokenizer.c +++ b/mwparserfromhell/parser/ctokenizer/tokenizer.c @@ -85,12 +85,8 @@ static void init_tokenizer_text(TokenizerInput* text) text->object = Py_None; Py_INCREF(Py_None); text->length = 0; -#ifdef PEP_393 text->kind = PyUnicode_WCHAR_KIND; text->data = NULL; -#else - text->buf = NULL; -#endif } /* @@ -119,14 +115,10 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input) dealloc_tokenizer_text(text); text->object = input; -#ifdef PEP_393 if (PyUnicode_READY(input) < 0) return -1; text->kind = PyUnicode_KIND(input); text->data = PyUnicode_DATA(input); -#else - text->buf = PyUnicode_AS_UNICODE(input); -#endif text->length = PyUnicode_GET_LENGTH(input); return 0; } @@ -192,11 +184,9 @@ static int load_entities(void) { PyObject *tempmod, *defmap, *deflist; unsigned numdefs, i; -#ifdef IS_PY3K PyObject *string; -#endif - tempmod = PyImport_ImportModule(ENTITYDEFS_MODULE); + tempmod = PyImport_ImportModule("html.entities"); if (!tempmod) return -1; defmap = PyObject_GetAttrString(tempmod, "entitydefs"); @@ -207,19 +197,15 @@ static int load_entities(void) if (!deflist) return -1; Py_DECREF(defmap); - numdefs = (unsigned) PyList_GET_SIZE(defmap); + numdefs = (unsigned) PyList_GET_SIZE(deflist); entitydefs = calloc(numdefs + 1, sizeof(char*)); if (!entitydefs) return -1; for (i = 0; i < numdefs; i++) { -#ifdef IS_PY3K string = PyUnicode_AsASCIIString(PyList_GET_ITEM(deflist, i)); if (!string) return -1; entitydefs[i] = PyBytes_AsString(string); -#else - entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i)); -#endif if (!entitydefs[i]) return -1; } @@ -233,7 +219,7 @@ static int load_tokens(void) *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(), *fromlist = PyList_New(1), - *modname = IMPORT_NAME_FUNC("tokens"); + *modname = PyUnicode_FromString("tokens"); char *name = "mwparserfromhell.parser"; if (!fromlist || !modname) @@ -256,7 +242,7 @@ static int load_defs(void) *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(), *fromlist = PyList_New(1), - *modname = IMPORT_NAME_FUNC("definitions"); + *modname = PyUnicode_FromString("definitions"); char *name = "mwparserfromhell"; if (!fromlist || !modname) @@ -277,7 +263,7 @@ static int load_exceptions(void) *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(), *fromlist = PyList_New(1), - *modname = IMPORT_NAME_FUNC("parser"); + *modname = PyUnicode_FromString("parser"); char *name = "mwparserfromhell"; if (!fromlist || !modname) @@ -294,24 +280,22 @@ static int load_exceptions(void) return 0; } -PyMODINIT_FUNC INIT_FUNC_NAME(void) +PyMODINIT_FUNC PyInit__tokenizer(void) { PyObject *module; TokenizerType.tp_new = PyType_GenericNew; if (PyType_Ready(&TokenizerType) < 0) - INIT_ERROR; - module = CREATE_MODULE; + return NULL; + module = PyModule_Create(&module_def); if (!module) - INIT_ERROR; + return NULL; Py_INCREF(&TokenizerType); PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType); Py_INCREF(Py_True); PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True); NOARGS = PyTuple_New(0); if (!NOARGS || load_entities() || load_tokens() || load_defs()) - INIT_ERROR; -#ifdef IS_PY3K + return NULL; return module; -#endif } diff --git a/mwparserfromhell/parser/ctokenizer/tokenizer.h b/mwparserfromhell/parser/ctokenizer/tokenizer.h index 6050ce0..ac98d79 100644 --- a/mwparserfromhell/parser/ctokenizer/tokenizer.h +++ b/mwparserfromhell/parser/ctokenizer/tokenizer.h @@ -32,22 +32,6 @@ static void Tokenizer_dealloc(Tokenizer*); static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); -/* Compatibility macros */ - -#ifdef IS_PY3K - #define IMPORT_NAME_FUNC PyUnicode_FromString - #define CREATE_MODULE PyModule_Create(&module_def); - #define ENTITYDEFS_MODULE "html.entities" - #define INIT_FUNC_NAME PyInit__tokenizer - #define INIT_ERROR return NULL -#else - #define IMPORT_NAME_FUNC PyBytes_FromString - #define CREATE_MODULE Py_InitModule("_tokenizer", NULL); - #define ENTITYDEFS_MODULE "htmlentitydefs" - #define INIT_FUNC_NAME init_tokenizer - #define INIT_ERROR return -#endif - /* Structs */ static PyMethodDef Tokenizer_methods[] = { @@ -101,11 +85,9 @@ static PyTypeObject TokenizerType = { Tokenizer_new, /* tp_new */ }; -#ifdef IS_PY3K static PyModuleDef module_def = { PyModuleDef_HEAD_INIT, "_tokenizer", "Creates a list of tokens from a string of wikicode.", -1, NULL, NULL, NULL, NULL, NULL }; -#endif diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 1bfbc8d..a95c477 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2018 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,12 +19,11 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import html.entities as htmlentities from math import log import re from . import contexts, tokens, ParserError -from ..compat import htmlentities, range from ..definitions import (get_html_tag, is_parsable, is_single, is_single_only, is_scheme) @@ -35,11 +33,11 @@ class BadRoute(Exception): """Raised internally when the current tokenization route is invalid.""" def __init__(self, context=0): - super(BadRoute, self).__init__() + super().__init__() self.context = context -class _TagOpenData(object): +class _TagOpenData: """Stores data about an HTML open tag, like ````.""" CX_NAME = 1 << 0 CX_ATTR_READY = 1 << 1 @@ -57,7 +55,7 @@ class _TagOpenData(object): self.reset = 0 -class Tokenizer(object): +class Tokenizer: """Creates a list of tokens from a string of wikicode.""" USES_C = False START = object() @@ -455,7 +453,7 @@ class Tokenizer(object): else: self._parse_free_uri_scheme() invalid = ("\n", " ", "[", "]") - punct = tuple(",;\.:!?)") + punct = tuple(",;\\.:!?)") if self._read() is self.END or self._read()[0] in invalid: self._fail_route() tail = "" @@ -931,7 +929,11 @@ class Tokenizer(object): self._head = reset if route.context & contexts.STYLE_PASS_AGAIN: new_ctx = contexts.STYLE_ITALICS | contexts.STYLE_SECOND_PASS - stack = self._parse(new_ctx) + try: + stack = self._parse(new_ctx) + except BadRoute: + self._head = reset + return self._emit_text("''") else: return self._emit_text("''") self._emit_style_tag("i", "''", stack) @@ -1133,6 +1135,7 @@ class Tokenizer(object): table = self._parse(contexts.TABLE_OPEN) except BadRoute: while self._stack_ident != restore_point: + self._memoize_bad_route() self._pop() self._head = reset self._emit_text("{") diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 036dc9b..ec99c67 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -28,9 +27,6 @@ a syntactically valid form by the :class:`.Tokenizer`, and then converted into the :class`.Wikicode` tree by the :class:`.Builder`. """ -from __future__ import unicode_literals - -from ..compat import py3k, str __all__ = ["Token"] @@ -44,7 +40,7 @@ class Token(dict): args.append(key + "=" + repr(value[:97] + "...")) else: args.append(key + "=" + repr(value)) - return "{0}({1})".format(type(self).__name__, ", ".join(args)) + return "{}({})".format(type(self).__name__, ", ".join(args)) def __eq__(self, other): return isinstance(other, type(self)) and dict.__eq__(self, other) @@ -65,7 +61,7 @@ class Token(dict): def make(name): """Create a new Token class using ``type()`` and add it to ``__all__``.""" __all__.append(name) - return type(name if py3k else name.encode("utf8"), (Token,), {}) + return type(name, (Token,), {}) Text = make("Text") diff --git a/mwparserfromhell/smart_list.py b/mwparserfromhell/smart_list.py deleted file mode 100644 index e7fa59f..0000000 --- a/mwparserfromhell/smart_list.py +++ /dev/null @@ -1,456 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2012-2016 Ben Kurtovic -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -""" -This module contains the :class:`.SmartList` type, as well as its -:class:`._ListProxy` child, which together implement a list whose sublists -reflect changes made to the main list, and vice-versa. -""" - -from __future__ import unicode_literals -from sys import maxsize -from weakref import ref - -from .compat import py3k - -__all__ = ["SmartList"] - -def inheritdoc(method): - """Set __doc__ of *method* to __doc__ of *method* in its parent class. - - Since this is used on :class:`.SmartList`, the "parent class" used is - ``list``. This function can be used as a decorator. - """ - method.__doc__ = getattr(list, method.__name__).__doc__ - return method - - -class _SliceNormalizerMixIn(object): - """MixIn that provides a private method to normalize slices.""" - - def _normalize_slice(self, key, clamp=False): - """Return a slice equivalent to the input *key*, standardized.""" - if key.start is None: - start = 0 - else: - start = (len(self) + key.start) if key.start < 0 else key.start - if key.stop is None or key.stop == maxsize: - stop = len(self) if clamp else None - else: - stop = (len(self) + key.stop) if key.stop < 0 else key.stop - return slice(start, stop, key.step or 1) - - -class SmartList(_SliceNormalizerMixIn, list): - """Implements the ``list`` interface with special handling of sublists. - - When a sublist is created (by ``list[i:j]``), any changes made to this - list (such as the addition, removal, or replacement of elements) will be - reflected in the sublist, or vice-versa, to the greatest degree possible. - This is implemented by having sublists - instances of the - :class:`._ListProxy` type - dynamically determine their elements by storing - their slice info and retrieving that slice from the parent. Methods that - change the size of the list also change the slice info. For example:: - - >>> parent = SmartList([0, 1, 2, 3]) - >>> parent - [0, 1, 2, 3] - >>> child = parent[2:] - >>> child - [2, 3] - >>> child.append(4) - >>> child - [2, 3, 4] - >>> parent - [0, 1, 2, 3, 4] - """ - - def __init__(self, iterable=None): - if iterable: - super(SmartList, self).__init__(iterable) - else: - super(SmartList, self).__init__() - self._children = {} - - def __getitem__(self, key): - if not isinstance(key, slice): - return super(SmartList, self).__getitem__(key) - key = self._normalize_slice(key, clamp=False) - sliceinfo = [key.start, key.stop, key.step] - child = _ListProxy(self, sliceinfo) - child_ref = ref(child, self._delete_child) - self._children[id(child_ref)] = (child_ref, sliceinfo) - return child - - def __setitem__(self, key, item): - if not isinstance(key, slice): - return super(SmartList, self).__setitem__(key, item) - item = list(item) - super(SmartList, self).__setitem__(key, item) - key = self._normalize_slice(key, clamp=True) - diff = len(item) + (key.start - key.stop) // key.step - if not diff: - return - values = self._children.values if py3k else self._children.itervalues - for child, (start, stop, step) in values(): - if start > key.stop: - self._children[id(child)][1][0] += diff - if stop is not None and stop >= key.stop: - self._children[id(child)][1][1] += diff - - def __delitem__(self, key): - super(SmartList, self).__delitem__(key) - if isinstance(key, slice): - key = self._normalize_slice(key, clamp=True) - else: - key = slice(key, key + 1, 1) - diff = (key.stop - key.start) // key.step - values = self._children.values if py3k else self._children.itervalues - for child, (start, stop, step) in values(): - if start > key.start: - self._children[id(child)][1][0] -= diff - if stop is not None and stop >= key.stop: - self._children[id(child)][1][1] -= diff - - if not py3k: - def __getslice__(self, start, stop): - return self.__getitem__(slice(start, stop)) - - def __setslice__(self, start, stop, iterable): - self.__setitem__(slice(start, stop), iterable) - - def __delslice__(self, start, stop): - self.__delitem__(slice(start, stop)) - - def __add__(self, other): - return SmartList(list(self) + other) - - def __radd__(self, other): - return SmartList(other + list(self)) - - def __iadd__(self, other): - self.extend(other) - return self - - def _delete_child(self, child_ref): - """Remove a child reference that is about to be garbage-collected.""" - del self._children[id(child_ref)] - - def _detach_children(self): - """Remove all children and give them independent parent copies.""" - children = [val[0] for val in self._children.values()] - for child in children: - child()._parent = list(self) - self._children.clear() - - @inheritdoc - def append(self, item): - head = len(self) - self[head:head] = [item] - - @inheritdoc - def extend(self, item): - head = len(self) - self[head:head] = item - - @inheritdoc - def insert(self, index, item): - self[index:index] = [item] - - @inheritdoc - def pop(self, index=None): - if index is None: - index = len(self) - 1 - item = self[index] - del self[index] - return item - - @inheritdoc - def remove(self, item): - del self[self.index(item)] - - @inheritdoc - def reverse(self): - self._detach_children() - super(SmartList, self).reverse() - - if py3k: - @inheritdoc - def sort(self, key=None, reverse=None): - self._detach_children() - kwargs = {} - if key is not None: - kwargs["key"] = key - if reverse is not None: - kwargs["reverse"] = reverse - super(SmartList, self).sort(**kwargs) - else: - @inheritdoc - def sort(self, cmp=None, key=None, reverse=None): - self._detach_children() - kwargs = {} - if cmp is not None: - kwargs["cmp"] = cmp - if key is not None: - kwargs["key"] = key - if reverse is not None: - kwargs["reverse"] = reverse - super(SmartList, self).sort(**kwargs) - - -class _ListProxy(_SliceNormalizerMixIn, list): - """Implement the ``list`` interface by getting elements from a parent. - - This is created by a :class:`.SmartList` object when slicing. It does not - actually store the list at any time; instead, whenever the list is needed, - it builds it dynamically using the :meth:`_render` method. - """ - - def __init__(self, parent, sliceinfo): - super(_ListProxy, self).__init__() - self._parent = parent - self._sliceinfo = sliceinfo - - def __repr__(self): - return repr(self._render()) - - def __lt__(self, other): - if isinstance(other, _ListProxy): - return self._render() < list(other) - return self._render() < other - - def __le__(self, other): - if isinstance(other, _ListProxy): - return self._render() <= list(other) - return self._render() <= other - - def __eq__(self, other): - if isinstance(other, _ListProxy): - return self._render() == list(other) - return self._render() == other - - def __ne__(self, other): - if isinstance(other, _ListProxy): - return self._render() != list(other) - return self._render() != other - - def __gt__(self, other): - if isinstance(other, _ListProxy): - return self._render() > list(other) - return self._render() > other - - def __ge__(self, other): - if isinstance(other, _ListProxy): - return self._render() >= list(other) - return self._render() >= other - - if py3k: - def __bool__(self): - return bool(self._render()) - else: - def __nonzero__(self): - return bool(self._render()) - - def __len__(self): - return max((self._stop - self._start) // self._step, 0) - - def __getitem__(self, key): - if isinstance(key, slice): - key = self._normalize_slice(key, clamp=True) - keystart = min(self._start + key.start, self._stop) - keystop = min(self._start + key.stop, self._stop) - adjusted = slice(keystart, keystop, key.step) - return self._parent[adjusted] - else: - return self._render()[key] - - def __setitem__(self, key, item): - if isinstance(key, slice): - key = self._normalize_slice(key, clamp=True) - keystart = min(self._start + key.start, self._stop) - keystop = min(self._start + key.stop, self._stop) - adjusted = slice(keystart, keystop, key.step) - self._parent[adjusted] = item - else: - length = len(self) - if key < 0: - key = length + key - if key < 0 or key >= length: - raise IndexError("list assignment index out of range") - self._parent[self._start + key] = item - - def __delitem__(self, key): - if isinstance(key, slice): - key = self._normalize_slice(key, clamp=True) - keystart = min(self._start + key.start, self._stop) - keystop = min(self._start + key.stop, self._stop) - adjusted = slice(keystart, keystop, key.step) - del self._parent[adjusted] - else: - length = len(self) - if key < 0: - key = length + key - if key < 0 or key >= length: - raise IndexError("list assignment index out of range") - del self._parent[self._start + key] - - def __iter__(self): - i = self._start - while i < self._stop: - yield self._parent[i] - i += self._step - - def __reversed__(self): - i = self._stop - 1 - while i >= self._start: - yield self._parent[i] - i -= self._step - - def __contains__(self, item): - return item in self._render() - - if not py3k: - def __getslice__(self, start, stop): - return self.__getitem__(slice(start, stop)) - - def __setslice__(self, start, stop, iterable): - self.__setitem__(slice(start, stop), iterable) - - def __delslice__(self, start, stop): - self.__delitem__(slice(start, stop)) - - def __add__(self, other): - return SmartList(list(self) + other) - - def __radd__(self, other): - return SmartList(other + list(self)) - - def __iadd__(self, other): - self.extend(other) - return self - - def __mul__(self, other): - return SmartList(list(self) * other) - - def __rmul__(self, other): - return SmartList(other * list(self)) - - def __imul__(self, other): - self.extend(list(self) * (other - 1)) - return self - - @property - def _start(self): - """The starting index of this list, inclusive.""" - return self._sliceinfo[0] - - @property - def _stop(self): - """The ending index of this list, exclusive.""" - if self._sliceinfo[1] is None: - return len(self._parent) - return self._sliceinfo[1] - - @property - def _step(self): - """The number to increase the index by between items.""" - return self._sliceinfo[2] - - def _render(self): - """Return the actual list from the stored start/stop/step.""" - return list(self._parent)[self._start:self._stop:self._step] - - @inheritdoc - def append(self, item): - self._parent.insert(self._stop, item) - - @inheritdoc - def count(self, item): - return self._render().count(item) - - @inheritdoc - def index(self, item, start=None, stop=None): - if start is not None: - if stop is not None: - return self._render().index(item, start, stop) - return self._render().index(item, start) - return self._render().index(item) - - @inheritdoc - def extend(self, item): - self._parent[self._stop:self._stop] = item - - @inheritdoc - def insert(self, index, item): - if index < 0: - index = len(self) + index - self._parent.insert(self._start + index, item) - - @inheritdoc - def pop(self, index=None): - length = len(self) - if index is None: - index = length - 1 - elif index < 0: - index = length + index - if index < 0 or index >= length: - raise IndexError("pop index out of range") - return self._parent.pop(self._start + index) - - @inheritdoc - def remove(self, item): - index = self.index(item) - del self._parent[self._start + index] - - @inheritdoc - def reverse(self): - item = self._render() - item.reverse() - self._parent[self._start:self._stop:self._step] = item - - if py3k: - @inheritdoc - def sort(self, key=None, reverse=None): - item = self._render() - kwargs = {} - if key is not None: - kwargs["key"] = key - if reverse is not None: - kwargs["reverse"] = reverse - item.sort(**kwargs) - self._parent[self._start:self._stop:self._step] = item - else: - @inheritdoc - def sort(self, cmp=None, key=None, reverse=None): - item = self._render() - kwargs = {} - if cmp is not None: - kwargs["cmp"] = cmp - if key is not None: - kwargs["key"] = key - if reverse is not None: - kwargs["reverse"] = reverse - item.sort(**kwargs) - self._parent[self._start:self._stop:self._step] = item - - -del inheritdoc diff --git a/mwparserfromhell/smart_list/ListProxy.py b/mwparserfromhell/smart_list/ListProxy.py new file mode 100644 index 0000000..35b45dc --- /dev/null +++ b/mwparserfromhell/smart_list/ListProxy.py @@ -0,0 +1,233 @@ +# +# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2019-2020 Yuri Astrakhan +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# SmartList has to be a full import in order to avoid cyclical import errors +import mwparserfromhell.smart_list.SmartList +from .utils import _SliceNormalizerMixIn, inheritdoc + + +class _ListProxy(_SliceNormalizerMixIn, list): + """Implement the ``list`` interface by getting elements from a parent. + + This is created by a :class:`.SmartList` object when slicing. It does not + actually store the list at any time; instead, whenever the list is needed, + it builds it dynamically using the :meth:`_render` method. + """ + + def __init__(self, parent, sliceinfo): + super().__init__() + self._parent = parent + self._sliceinfo = sliceinfo + + def __repr__(self): + return repr(self._render()) + + def __lt__(self, other): + if isinstance(other, _ListProxy): + return self._render() < list(other) + return self._render() < other + + def __le__(self, other): + if isinstance(other, _ListProxy): + return self._render() <= list(other) + return self._render() <= other + + def __eq__(self, other): + if isinstance(other, _ListProxy): + return self._render() == list(other) + return self._render() == other + + def __ne__(self, other): + if isinstance(other, _ListProxy): + return self._render() != list(other) + return self._render() != other + + def __gt__(self, other): + if isinstance(other, _ListProxy): + return self._render() > list(other) + return self._render() > other + + def __ge__(self, other): + if isinstance(other, _ListProxy): + return self._render() >= list(other) + return self._render() >= other + + def __bool__(self): + return bool(self._render()) + + def __len__(self): + return max((self._stop - self._start) // self._step, 0) + + def __getitem__(self, key): + if isinstance(key, slice): + key = self._normalize_slice(key, clamp=True) + keystart = min(self._start + key.start, self._stop) + keystop = min(self._start + key.stop, self._stop) + adjusted = slice(keystart, keystop, key.step) + return self._parent[adjusted] + else: + return self._render()[key] + + def __setitem__(self, key, item): + if isinstance(key, slice): + key = self._normalize_slice(key, clamp=True) + keystart = min(self._start + key.start, self._stop) + keystop = min(self._start + key.stop, self._stop) + adjusted = slice(keystart, keystop, key.step) + self._parent[adjusted] = item + else: + length = len(self) + if key < 0: + key = length + key + if key < 0 or key >= length: + raise IndexError("list assignment index out of range") + self._parent[self._start + key] = item + + def __delitem__(self, key): + if isinstance(key, slice): + key = self._normalize_slice(key, clamp=True) + keystart = min(self._start + key.start, self._stop) + keystop = min(self._start + key.stop, self._stop) + adjusted = slice(keystart, keystop, key.step) + del self._parent[adjusted] + else: + length = len(self) + if key < 0: + key = length + key + if key < 0 or key >= length: + raise IndexError("list assignment index out of range") + del self._parent[self._start + key] + + def __iter__(self): + i = self._start + while i < self._stop: + yield self._parent[i] + i += self._step + + def __reversed__(self): + i = self._stop - 1 + while i >= self._start: + yield self._parent[i] + i -= self._step + + def __contains__(self, item): + return item in self._render() + + def __add__(self, other): + return mwparserfromhell.smart_list.SmartList(list(self) + other) + + def __radd__(self, other): + return mwparserfromhell.smart_list.SmartList(other + list(self)) + + def __iadd__(self, other): + self.extend(other) + return self + + def __mul__(self, other): + return mwparserfromhell.smart_list.SmartList(list(self) * other) + + def __rmul__(self, other): + return mwparserfromhell.smart_list.SmartList(other * list(self)) + + def __imul__(self, other): + self.extend(list(self) * (other - 1)) + return self + + @property + def _start(self): + """The starting index of this list, inclusive.""" + return self._sliceinfo[0] + + @property + def _stop(self): + """The ending index of this list, exclusive.""" + if self._sliceinfo[1] is None: + return len(self._parent) + return self._sliceinfo[1] + + @property + def _step(self): + """The number to increase the index by between items.""" + return self._sliceinfo[2] + + def _render(self): + """Return the actual list from the stored start/stop/step.""" + return list(self._parent)[self._start:self._stop:self._step] + + @inheritdoc + def append(self, item): + self._parent.insert(self._stop, item) + + @inheritdoc + def count(self, item): + return self._render().count(item) + + @inheritdoc + def index(self, item, start=None, stop=None): + if start is not None: + if stop is not None: + return self._render().index(item, start, stop) + return self._render().index(item, start) + return self._render().index(item) + + @inheritdoc + def extend(self, item): + self._parent[self._stop:self._stop] = item + + @inheritdoc + def insert(self, index, item): + if index < 0: + index = len(self) + index + self._parent.insert(self._start + index, item) + + @inheritdoc + def pop(self, index=None): + length = len(self) + if index is None: + index = length - 1 + elif index < 0: + index = length + index + if index < 0 or index >= length: + raise IndexError("pop index out of range") + return self._parent.pop(self._start + index) + + @inheritdoc + def remove(self, item): + index = self.index(item) + del self._parent[self._start + index] + + @inheritdoc + def reverse(self): + item = self._render() + item.reverse() + self._parent[self._start:self._stop:self._step] = item + + @inheritdoc + def sort(self, key=None, reverse=None): + item = self._render() + kwargs = {} + if key is not None: + kwargs["key"] = key + if reverse is not None: + kwargs["reverse"] = reverse + item.sort(**kwargs) + self._parent[self._start:self._stop:self._step] = item diff --git a/mwparserfromhell/smart_list/SmartList.py b/mwparserfromhell/smart_list/SmartList.py new file mode 100644 index 0000000..c2e83a4 --- /dev/null +++ b/mwparserfromhell/smart_list/SmartList.py @@ -0,0 +1,157 @@ +# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2019-2020 Yuri Astrakhan +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from _weakref import ref + +from .ListProxy import _ListProxy +from .utils import _SliceNormalizerMixIn, inheritdoc + + +class SmartList(_SliceNormalizerMixIn, list): + """Implements the ``list`` interface with special handling of sublists. + + When a sublist is created (by ``list[i:j]``), any changes made to this + list (such as the addition, removal, or replacement of elements) will be + reflected in the sublist, or vice-versa, to the greatest degree possible. + This is implemented by having sublists - instances of the + :class:`._ListProxy` type - dynamically determine their elements by storing + their slice info and retrieving that slice from the parent. Methods that + change the size of the list also change the slice info. For example:: + + >>> parent = SmartList([0, 1, 2, 3]) + >>> parent + [0, 1, 2, 3] + >>> child = parent[2:] + >>> child + [2, 3] + >>> child.append(4) + >>> child + [2, 3, 4] + >>> parent + [0, 1, 2, 3, 4] + """ + + def __init__(self, iterable=None): + if iterable: + super().__init__(iterable) + else: + super().__init__() + self._children = {} + + def __getitem__(self, key): + if not isinstance(key, slice): + return super().__getitem__(key) + key = self._normalize_slice(key, clamp=False) + sliceinfo = [key.start, key.stop, key.step] + child = _ListProxy(self, sliceinfo) + child_ref = ref(child, self._delete_child) + self._children[id(child_ref)] = (child_ref, sliceinfo) + return child + + def __setitem__(self, key, item): + if not isinstance(key, slice): + return super().__setitem__(key, item) + item = list(item) + super().__setitem__(key, item) + key = self._normalize_slice(key, clamp=True) + diff = len(item) + (key.start - key.stop) // key.step + if not diff: + return + for child, (start, stop, step) in self._children.values(): + if start > key.stop: + self._children[id(child)][1][0] += diff + if stop is not None and stop >= key.stop: + self._children[id(child)][1][1] += diff + + def __delitem__(self, key): + super().__delitem__(key) + if isinstance(key, slice): + key = self._normalize_slice(key, clamp=True) + else: + key = slice(key, key + 1, 1) + diff = (key.stop - key.start) // key.step + for child, (start, stop, step) in self._children.values(): + if start > key.start: + self._children[id(child)][1][0] -= diff + if stop is not None and stop >= key.stop: + self._children[id(child)][1][1] -= diff + + def __add__(self, other): + return SmartList(list(self) + other) + + def __radd__(self, other): + return SmartList(other + list(self)) + + def __iadd__(self, other): + self.extend(other) + return self + + def _delete_child(self, child_ref): + """Remove a child reference that is about to be garbage-collected.""" + del self._children[id(child_ref)] + + def _detach_children(self): + """Remove all children and give them independent parent copies.""" + children = [val[0] for val in self._children.values()] + for child in children: + child()._parent = list(self) + self._children.clear() + + @inheritdoc + def append(self, item): + head = len(self) + self[head:head] = [item] + + @inheritdoc + def extend(self, item): + head = len(self) + self[head:head] = item + + @inheritdoc + def insert(self, index, item): + self[index:index] = [item] + + @inheritdoc + def pop(self, index=None): + if index is None: + index = len(self) - 1 + item = self[index] + del self[index] + return item + + @inheritdoc + def remove(self, item): + del self[self.index(item)] + + @inheritdoc + def reverse(self): + self._detach_children() + super().reverse() + + @inheritdoc + def sort(self, key=None, reverse=None): + self._detach_children() + kwargs = {} + if key is not None: + kwargs["key"] = key + if reverse is not None: + kwargs["reverse"] = reverse + super().sort(**kwargs) diff --git a/mwparserfromhell/smart_list/__init__.py b/mwparserfromhell/smart_list/__init__.py new file mode 100644 index 0000000..fdf7bd8 --- /dev/null +++ b/mwparserfromhell/smart_list/__init__.py @@ -0,0 +1,29 @@ +# +# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2019-2020 Yuri Astrakhan +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +""" +This module contains the :class:`.SmartList` type, as well as its +:class:`._ListProxy` child, which together implement a list whose sublists +reflect changes made to the main list, and vice-versa. +""" + +from .SmartList import SmartList diff --git a/mwparserfromhell/smart_list/utils.py b/mwparserfromhell/smart_list/utils.py new file mode 100644 index 0000000..1a36d0b --- /dev/null +++ b/mwparserfromhell/smart_list/utils.py @@ -0,0 +1,50 @@ +# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2019-2020 Yuri Astrakhan +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from sys import maxsize + +__all__ = [] + + +def inheritdoc(method): + """Set __doc__ of *method* to __doc__ of *method* in its parent class. + + Since this is used on :class:`.SmartList`, the "parent class" used is + ``list``. This function can be used as a decorator. + """ + method.__doc__ = getattr(list, method.__name__).__doc__ + return method + + +class _SliceNormalizerMixIn: + """MixIn that provides a private method to normalize slices.""" + + def _normalize_slice(self, key, clamp=False): + """Return a slice equivalent to the input *key*, standardized.""" + if key.start is None: + start = 0 + else: + start = (len(self) + key.start) if key.start < 0 else key.start + if key.stop is None or key.stop == maxsize: + stop = len(self) if clamp else None + else: + stop = (len(self) + key.stop) if key.stop < 0 else key.stop + return slice(start, stop, key.step or 1) diff --git a/mwparserfromhell/string_mixin.py b/mwparserfromhell/string_mixin.py index 88898a1..564706d 100644 --- a/mwparserfromhell/string_mixin.py +++ b/mwparserfromhell/string_mixin.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -22,14 +21,11 @@ """ This module contains the :class:`.StringMixIn` type, which implements the -interface for the ``unicode`` type (``str`` on py3k) in a dynamic manner. +interface for the ``str`` type in a dynamic manner. """ -from __future__ import unicode_literals from sys import getdefaultencoding -from .compat import bytes, py26, py3k, str - __all__ = ["StringMixIn"] def inheritdoc(method): @@ -41,24 +37,20 @@ def inheritdoc(method): method.__doc__ = getattr(str, method.__name__).__doc__ return method -class StringMixIn(object): +class StringMixIn: """Implement the interface for ``unicode``/``str`` in a dynamic manner. To use this class, inherit from it and override the :meth:`__unicode__` - method (same on py3k) to return the string representation of the object. + method to return the string representation of the object. The various string methods will operate on the value of :meth:`__unicode__` instead of the immutable ``self`` like the regular ``str`` type. """ - if py3k: - def __str__(self): - return self.__unicode__() + def __str__(self): + return self.__unicode__() - def __bytes__(self): - return bytes(self.__unicode__(), getdefaultencoding()) - else: - def __str__(self): - return bytes(self.__unicode__()) + def __bytes__(self): + return bytes(self.__unicode__(), getdefaultencoding()) def __unicode__(self): raise NotImplementedError() @@ -84,19 +76,14 @@ class StringMixIn(object): def __ge__(self, other): return self.__unicode__() >= other - if py3k: - def __bool__(self): - return bool(self.__unicode__()) - else: - def __nonzero__(self): - return bool(self.__unicode__()) + def __bool__(self): + return bool(self.__unicode__()) def __len__(self): return len(self.__unicode__()) def __iter__(self): - for char in self.__unicode__(): - yield char + yield from self.__unicode__() def __getitem__(self, key): return self.__unicode__()[key] @@ -109,21 +96,11 @@ class StringMixIn(object): def __getattr__(self, attr): if not hasattr(str, attr): - raise AttributeError("{0!r} object has no attribute {1!r}".format( + raise AttributeError("{!r} object has no attribute {!r}".format( type(self).__name__, attr)) return getattr(self.__unicode__(), attr) - if py3k: - maketrans = str.maketrans # Static method can't rely on __getattr__ - - if py26: - @inheritdoc - def encode(self, encoding=None, errors=None): - if encoding is None: - encoding = getdefaultencoding() - if errors is not None: - return self.__unicode__().encode(encoding, errors) - return self.__unicode__().encode(encoding) + maketrans = str.maketrans # Static method can't rely on __getattr__ del inheritdoc diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index 7387420..9e5e14b 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -25,9 +24,7 @@ This module contains accessory functions for other parts of the library. Parser users generally won't need stuff from here. """ -from __future__ import unicode_literals -from .compat import bytes, str from .nodes import Node from .smart_list import SmartList @@ -70,5 +67,5 @@ def parse_anything(value, context=0, skip_style_tags=False): nodelist += parse_anything(item, context, skip_style_tags).nodes return Wikicode(nodelist) except TypeError: - error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" + error = "Needs string, Node, Wikicode, file, int, None, or iterable of these, but got {0}: {1}" raise ValueError(error.format(type(value).__name__, value)) diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index 4379b0a..f72c26b 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2017 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,13 +19,12 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals -from itertools import chain import re +from itertools import chain -from .compat import bytes, py3k, range, str from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Node, Tag, Template, Text, Wikilink) +from .smart_list.ListProxy import _ListProxy from .string_mixin import StringMixIn from .utils import parse_anything @@ -47,7 +45,7 @@ class Wikicode(StringMixIn): RECURSE_OTHERS = 2 def __init__(self, nodes): - super(Wikicode, self).__init__() + super().__init__() self._nodes = nodes def __unicode__(self): @@ -55,15 +53,14 @@ class Wikicode(StringMixIn): @staticmethod def _get_children(node, contexts=False, restrict=None, parent=None): - """Iterate over all child :class:`.Node`\ s of a given *node*.""" + """Iterate over all child :class:`.Node`\\ s of a given *node*.""" yield (parent, node) if contexts else node if restrict and isinstance(node, restrict): return for code in node.__children__(): for child in code.nodes: sub = Wikicode._get_children(child, contexts, restrict, code) - for result in sub: - yield result + yield from sub @staticmethod def _slice_replace(code, index, old, new): @@ -108,6 +105,26 @@ class Wikicode(StringMixIn): if (not forcetype or isinstance(node, forcetype)) and match(node): yield (i, node) + def _is_child_wikicode(self, obj, recursive=True): + """Return whether the given :class:`.Wikicode` is a descendant.""" + def deref(nodes): + if isinstance(nodes, _ListProxy): + return nodes._parent # pylint: disable=protected-access + return nodes + + target = deref(obj.nodes) + if target is deref(self.nodes): + return True + if recursive: + todo = [self] + while todo: + code = todo.pop() + if target is deref(code.nodes): + return True + for node in code.nodes: + todo += list(node.__children__()) + return False + def _do_strong_search(self, obj, recursive=True): """Search for the specific element *obj* within the node list. @@ -120,11 +137,16 @@ class Wikicode(StringMixIn): :class:`.Wikicode` contained by a node within ``self``. If *obj* is not found, :exc:`ValueError` is raised. """ + if isinstance(obj, Wikicode): + if not self._is_child_wikicode(obj, recursive): + raise ValueError(obj) + return obj, slice(0, len(obj.nodes)) + if isinstance(obj, Node): mkslice = lambda i: slice(i, i + 1) if not recursive: return self, mkslice(self.index(obj)) - for i, node in enumerate(self.nodes): + for node in self.nodes: for context, child in self._get_children(node, contexts=True): if obj is child: if not context: @@ -132,11 +154,7 @@ class Wikicode(StringMixIn): return context, mkslice(context.index(child)) raise ValueError(obj) - context, ind = self._do_strong_search(obj.get(0), recursive) - for i in range(1, len(obj.nodes)): - if obj.get(i) is not context.get(ind.start + i): - raise ValueError(obj) - return context, slice(ind.start, ind.start + len(obj.nodes)) + raise TypeError(obj) def _do_weak_search(self, obj, recursive): """Search for an element that looks like *obj* within the node list. @@ -230,7 +248,7 @@ class Wikicode(StringMixIn): self.ifilter(forcetype=ftype, *a, **kw)) make_filter = lambda ftype: (lambda self, *a, **kw: self.filter(forcetype=ftype, *a, **kw)) - for name, ftype in (meths.items() if py3k else meths.iteritems()): + for name, ftype in meths.items(): ifilter = make_ifilter(ftype) filter = make_filter(ftype) ifilter.__doc__ = doc.format(name, "ifilter", ftype) @@ -254,7 +272,7 @@ class Wikicode(StringMixIn): self._nodes = value def get(self, index): - """Return the *index*\ th node within the list of nodes.""" + """Return the *index*\\ th node within the list of nodes.""" return self.nodes[index] def set(self, index, value): @@ -479,16 +497,16 @@ class Wikicode(StringMixIn): letter's case is normalized. Typical usage is ``if template.name.matches("stub"): ...``. """ - cmp = lambda a, b: (a[0].upper() + a[1:] == b[0].upper() + b[1:] - if a and b else a == b) - this = self.strip_code().strip() + normalize = lambda s: (s[0].upper() + s[1:]).replace("_", " ") if s else s + this = normalize(self.strip_code().strip()) + if isinstance(other, (str, bytes, Wikicode, Node)): that = parse_anything(other).strip_code().strip() - return cmp(this, that) + return this == normalize(that) for obj in other: that = parse_anything(obj).strip_code().strip() - if cmp(this, that): + if this == normalize(that): return True return False diff --git a/scripts/memtest.py b/scripts/memtest.py index 823560d..f60e260 100644 --- a/scripts/memtest.py +++ b/scripts/memtest.py @@ -40,7 +40,6 @@ import sys import psutil -from mwparserfromhell.compat import py3k from mwparserfromhell.parser._tokenizer import CTokenizer if sys.version_info[0] == 2: @@ -80,7 +79,7 @@ class MemoryTest(object): raw = raw.encode("raw_unicode_escape") data["input"] = raw.decode("unicode_escape") number = str(counter).zfill(digits) - fname = "test_{0}{1}_{2}".format(name, number, data["name"]) + fname = "test_{}{}_{}".format(name, number, data["name"]) self._tests.append((fname, data["input"])) counter += 1 @@ -88,8 +87,6 @@ class MemoryTest(object): def load_file(filename): with open(filename, "rU") as fp: text = fp.read() - if not py3k: - text = text.decode("utf8") name = path.split(filename)[1][:0-len(extension)] self._parse_file(name, text) @@ -117,7 +114,7 @@ class MemoryTest(object): tmpl = "{0}LEAKING{1}: {2:n} bytes, {3:.2%} inc ({4:n} bytes/loop)" sys.stdout.write(tmpl.format(Color.YELLOW, Color.RESET, d, p, bpt)) else: - sys.stdout.write("{0}OK{1}".format(Color.GREEN, Color.RESET)) + sys.stdout.write("{}OK{}".format(Color.GREEN, Color.RESET)) def run(self): """Run the memory test suite.""" diff --git a/scripts/release.sh b/scripts/release.sh index 5dbefbe..f7143c8 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -1,7 +1,5 @@ #! /usr/bin/env bash -set -euo pipefail - if [[ -z "$1" ]]; then echo "usage: $0 1.2.3" exit 1 @@ -77,7 +75,8 @@ do_git_stuff() { upload_to_pypi() { echo -n "PyPI: uploading source tarball..." - python setup.py -q register sdist upload -s + python setup.py -q sdist + twine upload -s dist/mwparserfromhell-$VERSION* echo " done." } @@ -85,7 +84,7 @@ post_release() { echo echo "*** Release completed." echo "*** Update: https://github.com/earwig/mwparserfromhell/releases/tag/v$VERSION" - echo "*** Verify: https://pypi.python.org/pypi/mwparserfromhell" + echo "*** Verify: https://pypi.org/project/mwparserfromhell" echo "*** Verify: https://ci.appveyor.com/project/earwig/mwparserfromhell" echo "*** Verify: https://mwparserfromhell.readthedocs.io" echo "*** Press enter to sanity-check the release." @@ -97,7 +96,7 @@ test_release() { echo "Checking mwparserfromhell v$VERSION..." echo -n "Creating a virtualenv..." virtdir="mwparser-test-env" - virtualenv -q $virtdir + python -m venv $virtdir cd $virtdir source bin/activate echo " done." @@ -105,7 +104,7 @@ test_release() { pip -q install mwparserfromhell echo " done." echo -n "Checking version..." - reported_version=$(python -c 'print __import__("mwparserfromhell").__version__') + reported_version=$(python -c 'print(__import__("mwparserfromhell").__version__)') if [[ "$reported_version" != "$VERSION" ]]; then echo " error." echo "*** ERROR: mwparserfromhell is reporting its version as $reported_version, not $VERSION!" @@ -134,7 +133,8 @@ test_release() { rm mwparserfromhell.tar.gz mwparserfromhell.tar.gz.asc cd mwparserfromhell-$VERSION echo "Running unit tests..." - python setup.py -q test + python setup.py -q install + python -m unittest discover if [[ "$?" != "0" ]]; then echo "*** ERROR: Unit tests failed!" deactivate diff --git a/setup.py b/setup.py index 0b33d42..f339665 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ #! /usr/bin/env python -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2018 Ben Kurtovic # @@ -21,23 +20,17 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import print_function from distutils.errors import DistutilsError, CCompilerError from glob import glob from os import environ import sys -if ((sys.version_info[0] == 2 and sys.version_info[1] < 6) or - (sys.version_info[1] == 3 and sys.version_info[1] < 2)): - raise RuntimeError("mwparserfromhell needs Python 2.6+ or 3.2+") - from setuptools import setup, find_packages, Extension from setuptools.command.build_ext import build_ext from mwparserfromhell import __version__ -from mwparserfromhell.compat import py26, py3k -with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp: +with open("README.rst", encoding='utf-8') as fp: long_docs = fp.read() use_extension = True @@ -76,21 +69,21 @@ if fallback: tokenizer = Extension("mwparserfromhell.parser._tokenizer", sources=sorted(glob("mwparserfromhell/parser/ctokenizer/*.c")), - depends=glob("mwparserfromhell/parser/ctokenizer/*.h")) + depends=sorted(glob("mwparserfromhell/parser/ctokenizer/*.h"))) setup( name = "mwparserfromhell", packages = find_packages(exclude=("tests",)), ext_modules = [tokenizer] if use_extension else [], - tests_require = ["unittest2"] if py26 else [], - test_suite = "tests.discover", + test_suite = "tests", version = __version__, + python_requires = ">= 3.4", author = "Ben Kurtovic", author_email = "ben.kurtovic@gmail.com", url = "https://github.com/earwig/mwparserfromhell", description = "MWParserFromHell is a parser for MediaWiki wikicode.", long_description = long_docs, - download_url = "https://github.com/earwig/mwparserfromhell/tarball/v{0}".format(__version__), + download_url = "https://github.com/earwig/mwparserfromhell/tarball/v{}".format(__version__), keywords = "earwig mwparserfromhell wikipedia wiki mediawiki wikicode template parsing", license = "MIT License", classifiers = [ @@ -99,15 +92,12 @@ setup( "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", "Topic :: Text Processing :: Markup" ], ) diff --git a/tests/__init__.py b/tests/__init__.py index 89907bf..e69de29 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py index d025625..f61cb10 100644 --- a/tests/_test_tokenizer.py +++ b/tests/_test_tokenizer.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,12 +19,11 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import print_function, unicode_literals import codecs from os import listdir, path import sys +import warnings -from mwparserfromhell.compat import py3k, str from mwparserfromhell.parser import tokens from mwparserfromhell.parser.builder import Builder @@ -34,7 +32,7 @@ class _TestParseError(Exception): pass -class TokenizerTestCase(object): +class TokenizerTestCase: """A base test case for tokenizers, whose tests are loaded dynamically. Subclassed along with unittest.TestCase to form TestPyTokenizer and @@ -59,8 +57,6 @@ class TokenizerTestCase(object): actual = self.tokenizer().tokenize(data["input"]) self.assertEqual(expected, actual) - if not py3k: - inner.__name__ = funcname.encode("utf8") inner.__doc__ = data["label"] return inner @@ -98,19 +94,19 @@ class TokenizerTestCase(object): except _TestParseError as err: if data["name"]: error = "Could not parse test '{0}' in '{1}':\n\t{2}" - print(error.format(data["name"], filename, err)) + warnings.warn(error.format(data["name"], filename, err)) else: error = "Could not parse a test in '{0}':\n\t{1}" - print(error.format(filename, err)) + warnings.warn(error.format(filename, err)) continue if not data["name"]: error = "A test in '{0}' was ignored because it lacked a name" - print(error.format(filename)) + warnings.warn(error.format(filename)) continue if data["input"] is None or data["output"] is None: - error = "Test '{0}' in '{1}' was ignored because it lacked an input or an output" - print(error.format(data["name"], filename)) + error = "Test '{}' in '{}' was ignored because it lacked an input or an output" + warnings.warn(error.format(data["name"], filename)) continue number = str(counter).zfill(digits) @@ -118,7 +114,7 @@ class TokenizerTestCase(object): if restrict and data["name"] != restrict: continue - fname = "test_{0}{1}_{2}".format(name, number, data["name"]) + fname = "test_{}{}_{}".format(name, number, data["name"]) meth = cls._build_test_method(fname, data) setattr(cls, fname, meth) @@ -126,7 +122,7 @@ class TokenizerTestCase(object): def build(cls): """Load and install all tests from the 'tokenizer' directory.""" def load_file(filename, restrict=None): - with codecs.open(filename, "rU", encoding="utf8") as fp: + with codecs.open(filename, "r", encoding="utf8") as fp: text = fp.read() name = path.split(filename)[1][:-len(extension)] cls._load_tests(filename, name, text, restrict) diff --git a/tests/_test_tree_equality.py b/tests/_test_tree_equality.py index fe626ce..cdfbd3a 100644 --- a/tests/_test_tree_equality.py +++ b/tests/_test_tree_equality.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +from unittest import TestCase -try: - from unittest2 import TestCase -except ImportError: - from unittest import TestCase - -from mwparserfromhell.compat import range from mwparserfromhell.nodes import (Argument, Comment, Heading, HTMLEntity, Tag, Template, Text, Wikilink) from mwparserfromhell.nodes.extras import Attribute, Parameter @@ -71,7 +64,7 @@ class TreeEqualityTestCase(TestCase): def assertCommentNodeEqual(self, expected, actual): """Assert that two Comment nodes have the same data.""" - self.assertWikicodeEqual(expected.contents, actual.contents) + self.assertEqual(expected.contents, actual.contents) def assertHeadingNodeEqual(self, expected, actual): """Assert that two Heading nodes have the same data.""" diff --git a/tests/compat.py b/tests/compat.py deleted file mode 100644 index d5b3fba..0000000 --- a/tests/compat.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Serves the same purpose as mwparserfromhell.compat, but only for objects -required by unit tests. This avoids unnecessary imports (like urllib) within -the main library. -""" - -from mwparserfromhell.compat import py3k - -if py3k: - from io import StringIO - from urllib.parse import urlencode - from urllib.request import urlopen - -else: - from StringIO import StringIO - from urllib import urlencode, urlopen diff --git a/tests/discover.py b/tests/discover.py deleted file mode 100644 index 6bb971b..0000000 --- a/tests/discover.py +++ /dev/null @@ -1,24 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Discover tests using ``unittest2` for Python 2.6. - -It appears the default distutils test suite doesn't play nice with -``setUpClass`` thereby making some tests fail. Using ``unittest2`` to load -tests seems to work around that issue. - -http://stackoverflow.com/a/17004409/753501 -""" - -import os.path - -from mwparserfromhell.compat import py26 - -if py26: - import unittest2 as unittest -else: - import unittest - -def additional_tests(): - project_root = os.path.split(os.path.dirname(__file__))[0] - return unittest.defaultTestLoader.discover(project_root) diff --git a/tests/test_argument.py b/tests/test_argument.py index 6209b2f..eaf8abe 100644 --- a/tests/test_argument.py +++ b/tests/test_argument.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import str from mwparserfromhell.nodes import Argument, Text from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext diff --git a/tests/test_attribute.py b/tests/test_attribute.py index 7fe5772..b0d0e85 100644 --- a/tests/test_attribute.py +++ b/tests/test_attribute.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import str from mwparserfromhell.nodes import Template from mwparserfromhell.nodes.extras import Attribute diff --git a/tests/test_builder.py b/tests/test_builder.py index eed5861..e5f43aa 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import py3k from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, Template, Text, Wikilink) from mwparserfromhell.nodes.extras import Attribute, Parameter @@ -236,11 +229,11 @@ class TestBuilder(TreeEqualityTestCase): tests = [ ([tokens.CommentStart(), tokens.Text(text="foobar"), tokens.CommentEnd()], - wrap([Comment(wraptext("foobar"))])), + wrap([Comment("foobar")])), ([tokens.CommentStart(), tokens.Text(text="spam"), tokens.Text(text="eggs"), tokens.CommentEnd()], - wrap([Comment(wraptext("spam", "eggs"))])), + wrap([Comment("spameggs")])), ] for test, valid in tests: self.assertWikicodeEqual(valid, self.builder.build(test)) @@ -416,7 +409,7 @@ class TestBuilder(TreeEqualityTestCase): wraptext("c"), params=[Parameter(wraptext("1"), wrap([Wikilink( wraptext("d")), Argument(wraptext("e"))]), showkey=False)])]), showkey=False)]), Wikilink(wraptext("f"), wrap([Argument(wraptext( - "g")), Comment(wraptext("h"))])), Template(wraptext("i"), params=[ + "g")), Comment("h")])), Template(wraptext("i"), params=[ Parameter(wraptext("j"), wrap([HTMLEntity("nbsp", named=True)]))])]) self.assertWikicodeEqual(valid, self.builder.build(test)) @@ -432,9 +425,8 @@ class TestBuilder(TreeEqualityTestCase): [tokens.TagOpenOpen()] ] - func = self.assertRaisesRegex if py3k else self.assertRaisesRegexp msg = r"_handle_token\(\) got unexpected TemplateClose" - func(ParserError, msg, self.builder.build, [tokens.TemplateClose()]) + self.assertRaisesRegex(ParserError, msg, self.builder.build, [tokens.TemplateClose()]) for test in missing_closes: self.assertRaises(ParserError, self.builder.build, test) diff --git a/tests/test_comment.py b/tests/test_comment.py index 27129c9..1024e60 100644 --- a/tests/test_comment.py +++ b/tests/test_comment.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import str from mwparserfromhell.nodes import Comment from ._test_tree_equality import TreeEqualityTestCase diff --git a/tests/test_ctokenizer.py b/tests/test_ctokenizer.py index 27ff237..f9b8d2f 100644 --- a/tests/test_ctokenizer.py +++ b/tests/test_ctokenizer.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,12 +19,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals - -try: - import unittest2 as unittest -except ImportError: - import unittest +import unittest try: from mwparserfromhell.parser._tokenizer import CTokenizer diff --git a/tests/test_docs.py b/tests/test_docs.py index 398be4c..2e78106 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,19 +19,14 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import print_function, unicode_literals import json +from io import StringIO import os - -try: - import unittest2 as unittest -except ImportError: - import unittest +import unittest +from urllib.parse import urlencode +from urllib.request import urlopen import mwparserfromhell -from mwparserfromhell.compat import py3k, str - -from .compat import StringIO, urlencode, urlopen class TestDocs(unittest.TestCase): """Integration test cases for mwparserfromhell's documentation.""" @@ -51,16 +45,10 @@ class TestDocs(unittest.TestCase): self.assertPrint(wikicode, "I has a template! {{foo|bar|baz|eggs=spam}} See it?") templates = wikicode.filter_templates() - if py3k: - self.assertPrint(templates, "['{{foo|bar|baz|eggs=spam}}']") - else: - self.assertPrint(templates, "[u'{{foo|bar|baz|eggs=spam}}']") + self.assertPrint(templates, "['{{foo|bar|baz|eggs=spam}}']") template = templates[0] self.assertPrint(template.name, "foo") - if py3k: - self.assertPrint(template.params, "['bar', 'baz', 'eggs=spam']") - else: - self.assertPrint(template.params, "[u'bar', u'baz', u'eggs=spam']") + self.assertPrint(template.params, "['bar', 'baz', 'eggs=spam']") self.assertPrint(template.get(1).value, "bar") self.assertPrint(template.get("eggs").value, "spam") @@ -68,21 +56,14 @@ class TestDocs(unittest.TestCase): """test a block of example code in the README""" text = "{{foo|{{bar}}={{baz|{{spam}}}}}}" temps = mwparserfromhell.parse(text).filter_templates() - if py3k: - res = "['{{foo|{{bar}}={{baz|{{spam}}}}}}', '{{bar}}', '{{baz|{{spam}}}}', '{{spam}}']" - else: - res = "[u'{{foo|{{bar}}={{baz|{{spam}}}}}}', u'{{bar}}', u'{{baz|{{spam}}}}', u'{{spam}}']" + res = "['{{foo|{{bar}}={{baz|{{spam}}}}}}', '{{bar}}', '{{baz|{{spam}}}}', '{{spam}}']" self.assertPrint(temps, res) def test_readme_3(self): """test a block of example code in the README""" code = mwparserfromhell.parse("{{foo|this {{includes a|template}}}}") - if py3k: - self.assertPrint(code.filter_templates(recursive=False), - "['{{foo|this {{includes a|template}}}}']") - else: - self.assertPrint(code.filter_templates(recursive=False), - "[u'{{foo|this {{includes a|template}}}}']") + self.assertPrint(code.filter_templates(recursive=False), + "['{{foo|this {{includes a|template}}}}']") foo = code.filter_templates(recursive=False)[0] self.assertPrint(foo.get(1).value, "this {{includes a|template}}") self.assertPrint(foo.get(1).value.filter_templates()[0], @@ -102,10 +83,7 @@ class TestDocs(unittest.TestCase): code.replace("{{uncategorized}}", "{{bar-stub}}") res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}" self.assertPrint(code, res) - if py3k: - res = "['{{cleanup|date=July 2012}}', '{{bar-stub}}']" - else: - res = "[u'{{cleanup|date=July 2012}}', u'{{bar-stub}}']" + res = "['{{cleanup|date=July 2012}}', '{{bar-stub}}']" self.assertPrint(code.filter_templates(), res) text = str(code) res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}" @@ -118,17 +96,26 @@ class TestDocs(unittest.TestCase): url1 = "https://en.wikipedia.org/w/api.php" url2 = "https://en.wikipedia.org/w/index.php?title={0}&action=raw" title = "Test" - data = {"action": "query", "prop": "revisions", "rvlimit": 1, - "rvprop": "content", "format": "json", "titles": title} + data = { + "action": "query", + "prop": "revisions", + "rvprop": "content", + "rvslots": "main", + "rvlimit": 1, + "titles": title, + "format": "json", + "formatversion": "2", + } try: raw = urlopen(url1, urlencode(data).encode("utf8")).read() - except IOError: + except OSError: self.skipTest("cannot continue because of unsuccessful web call") res = json.loads(raw.decode("utf8")) - text = list(res["query"]["pages"].values())[0]["revisions"][0]["*"] + revision = res["query"]["pages"][0]["revisions"][0] + text = revision["slots"]["main"]["content"] try: expected = urlopen(url2.format(title)).read().decode("utf8") - except IOError: + except OSError: self.skipTest("cannot continue because of unsuccessful web call") actual = mwparserfromhell.parse(text) self.assertEqual(expected, actual) diff --git a/tests/test_external_link.py b/tests/test_external_link.py index 8cb3158..48a7b82 100644 --- a/tests/test_external_link.py +++ b/tests/test_external_link.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import str from mwparserfromhell.nodes import ExternalLink, Text from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext diff --git a/tests/test_heading.py b/tests/test_heading.py index 5e6776a..46c6258 100644 --- a/tests/test_heading.py +++ b/tests/test_heading.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import str from mwparserfromhell.nodes import Heading, Text from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext diff --git a/tests/test_html_entity.py b/tests/test_html_entity.py index 4db1c13..273ee21 100644 --- a/tests/test_html_entity.py +++ b/tests/test_html_entity.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import str from mwparserfromhell.nodes import HTMLEntity from ._test_tree_equality import TreeEqualityTestCase, wrap diff --git a/tests/test_parameter.py b/tests/test_parameter.py index 44c30af..d53c7af 100644 --- a/tests/test_parameter.py +++ b/tests/test_parameter.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import str from mwparserfromhell.nodes import Text from mwparserfromhell.nodes.extras import Parameter diff --git a/tests/test_parser.py b/tests/test_parser.py index d586ecd..22a76f6 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,15 +19,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals - -try: - import unittest2 as unittest -except ImportError: - import unittest +import unittest from mwparserfromhell import parser -from mwparserfromhell.compat import range from mwparserfromhell.nodes import Tag, Template, Text, Wikilink from mwparserfromhell.nodes.extras import Parameter diff --git a/tests/test_pytokenizer.py b/tests/test_pytokenizer.py index f7f26b8..9fd0c3e 100644 --- a/tests/test_pytokenizer.py +++ b/tests/test_pytokenizer.py @@ -1,6 +1,5 @@ -# -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,13 +19,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals - -try: - import unittest2 as unittest -except ImportError: - import unittest +import unittest +from mwparserfromhell.parser import contexts from mwparserfromhell.parser.tokenizer import Tokenizer from ._test_tokenizer import TokenizerTestCase @@ -44,5 +39,10 @@ class TestPyTokenizer(TokenizerTestCase, unittest.TestCase): self.assertFalse(Tokenizer.USES_C) self.assertFalse(Tokenizer().USES_C) + def test_describe_context(self): + self.assertEqual("", contexts.describe(0)) + ctx = contexts.describe(contexts.TEMPLATE_PARAM_KEY|contexts.HAS_TEXT) + self.assertEqual("TEMPLATE_PARAM_KEY|HAS_TEXT", ctx) + if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/tests/test_roundtripping.py b/tests/test_roundtripping.py index a217e21..9ecd5bd 100644 --- a/tests/test_roundtripping.py +++ b/tests/test_roundtripping.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,12 +19,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals - -try: - import unittest2 as unittest -except ImportError: - import unittest +import unittest from ._test_tokenizer import TokenizerTestCase diff --git a/tests/test_smart_list.py b/tests/test_smart_list.py index 3de7db7..16d99e7 100644 --- a/tests/test_smart_list.py +++ b/tests/test_smart_list.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,15 +19,11 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest +from mwparserfromhell.smart_list import SmartList +from mwparserfromhell.smart_list.ListProxy import _ListProxy -from mwparserfromhell.compat import py3k, range -from mwparserfromhell.smart_list import SmartList, _ListProxy class TestSmartList(unittest.TestCase): """Test cases for the SmartList class and its child, _ListProxy.""" @@ -130,45 +125,40 @@ class TestSmartList(unittest.TestCase): list3 = builder([0, 2, 3, 4]) list4 = builder([0, 1, 2]) - if py3k: - self.assertEqual("[0, 1, 2, 3, 'one', 'two']", str(list1)) - self.assertEqual(b"\x00\x01\x02", bytes(list4)) - self.assertEqual("[0, 1, 2, 3, 'one', 'two']", repr(list1)) - else: - self.assertEqual("[0, 1, 2, 3, u'one', u'two']", unicode(list1)) - self.assertEqual(b"[0, 1, 2, 3, u'one', u'two']", str(list1)) - self.assertEqual(b"[0, 1, 2, 3, u'one', u'two']", repr(list1)) - - self.assertTrue(list1 < list3) - self.assertTrue(list1 <= list3) - self.assertFalse(list1 == list3) - self.assertTrue(list1 != list3) - self.assertFalse(list1 > list3) - self.assertFalse(list1 >= list3) + self.assertEqual("[0, 1, 2, 3, 'one', 'two']", str(list1)) + self.assertEqual(b"\x00\x01\x02", bytes(list4)) + self.assertEqual("[0, 1, 2, 3, 'one', 'two']", repr(list1)) + + self.assertLess(list1, list3) + self.assertLessEqual(list1, list3) + self.assertNotEqual(list1, list3) + self.assertNotEqual(list1, list3) + self.assertLessEqual(list1, list3) + self.assertLess(list1, list3) other1 = [0, 2, 3, 4] - self.assertTrue(list1 < other1) - self.assertTrue(list1 <= other1) - self.assertFalse(list1 == other1) - self.assertTrue(list1 != other1) - self.assertFalse(list1 > other1) - self.assertFalse(list1 >= other1) + self.assertLess(list1, other1) + self.assertLessEqual(list1, other1) + self.assertNotEqual(list1, other1) + self.assertNotEqual(list1, other1) + self.assertLessEqual(list1, other1) + self.assertLess(list1, other1) other2 = [0, 0, 1, 2] - self.assertFalse(list1 < other2) - self.assertFalse(list1 <= other2) - self.assertFalse(list1 == other2) - self.assertTrue(list1 != other2) - self.assertTrue(list1 > other2) - self.assertTrue(list1 >= other2) + self.assertGreaterEqual(list1, other2) + self.assertGreater(list1, other2) + self.assertNotEqual(list1, other2) + self.assertNotEqual(list1, other2) + self.assertGreater(list1, other2) + self.assertGreaterEqual(list1, other2) other3 = [0, 1, 2, 3, "one", "two"] - self.assertFalse(list1 < other3) - self.assertTrue(list1 <= other3) - self.assertTrue(list1 == other3) - self.assertFalse(list1 != other3) - self.assertFalse(list1 > other3) - self.assertTrue(list1 >= other3) + self.assertGreaterEqual(list1, other3) + self.assertLessEqual(list1, other3) + self.assertEqual(list1, other3) + self.assertEqual(list1, other3) + self.assertLessEqual(list1, other3) + self.assertGreaterEqual(list1, other3) self.assertTrue(bool(list1)) self.assertFalse(bool(list2)) @@ -198,10 +188,10 @@ class TestSmartList(unittest.TestCase): self.assertEqual(["two", "one", 3, 2, 1, 0], list(reversed(list1))) self.assertEqual([], list(reversed(list2))) - self.assertTrue("one" in list1) - self.assertTrue(3 in list1) - self.assertFalse(10 in list1) - self.assertFalse(0 in list2) + self.assertIn("one", list1) + self.assertIn(3, list1) + self.assertNotIn(10, list1) + self.assertNotIn(0, list2) self.assertEqual([], list2 * 5) self.assertEqual([], 5 * list2) @@ -265,12 +255,6 @@ class TestSmartList(unittest.TestCase): self.assertEqual([0, 2, 2, 3, 4, 5], list1) list1.sort(reverse=True) self.assertEqual([5, 4, 3, 2, 2, 0], list1) - if not py3k: - func = lambda x, y: abs(3 - x) - abs(3 - y) # Distance from 3 - list1.sort(cmp=func) - self.assertEqual([3, 4, 2, 2, 5, 0], list1) - list1.sort(cmp=func, reverse=True) - self.assertEqual([0, 5, 4, 2, 2, 3], list1) list3.sort(key=lambda i: i[1]) self.assertEqual([("d", 2), ("c", 3), ("a", 5), ("b", 8)], list3) list3.sort(key=lambda i: i[1], reverse=True) diff --git a/tests/test_string_mixin.py b/tests/test_string_mixin.py index 08d5b9e..673d5fa 100644 --- a/tests/test_string_mixin.py +++ b/tests/test_string_mixin.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,16 +19,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from sys import getdefaultencoding from types import GeneratorType +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import bytes, py3k, py32, range, str from mwparserfromhell.string_mixin import StringMixIn class _FakeString(StringMixIn): @@ -46,20 +39,16 @@ class TestStringMixIn(unittest.TestCase): def test_docs(self): """make sure the various methods of StringMixIn have docstrings""" methods = [ - "capitalize", "center", "count", "encode", "endswith", - "expandtabs", "find", "format", "index", "isalnum", "isalpha", - "isdecimal", "isdigit", "islower", "isnumeric", "isspace", - "istitle", "isupper", "join", "ljust", "lower", "lstrip", - "partition", "replace", "rfind", "rindex", "rjust", "rpartition", - "rsplit", "rstrip", "split", "splitlines", "startswith", "strip", - "swapcase", "title", "translate", "upper", "zfill"] - if py3k: - if not py32: - methods.append("casefold") - methods.extend(["format_map", "isidentifier", "isprintable", - "maketrans"]) - else: - methods.append("decode") + "capitalize", "casefold", "center", "count", "encode", "endswith", + "expandtabs", "find", "format", "format_map", "index", "isalnum", + "isalpha", "isdecimal", "isdigit", "isidentifier", "islower", + "isnumeric", "isprintable", "isspace", "istitle", "isupper", + "join", "ljust", "lower", "lstrip", "maketrans", "partition", + "replace", "rfind", "rindex", "rjust", "rpartition", "rsplit", + "rstrip", "split", "splitlines", "startswith", "strip", "swapcase", + "title", "translate", "upper", "zfill" + ] + for meth in methods: expected = getattr("foo", meth).__doc__ actual = getattr(_FakeString("foo"), meth).__doc__ @@ -70,17 +59,11 @@ class TestStringMixIn(unittest.TestCase): fstr = _FakeString("fake string") self.assertEqual(str(fstr), "fake string") self.assertEqual(bytes(fstr), b"fake string") - if py3k: - self.assertEqual(repr(fstr), "'fake string'") - else: - self.assertEqual(repr(fstr), b"u'fake string'") + self.assertEqual(repr(fstr), "'fake string'") self.assertIsInstance(str(fstr), str) self.assertIsInstance(bytes(fstr), bytes) - if py3k: - self.assertIsInstance(repr(fstr), str) - else: - self.assertIsInstance(repr(fstr), bytes) + self.assertIsInstance(repr(fstr), str) def test_comparisons(self): """make sure comparison operators work""" @@ -90,33 +73,33 @@ class TestStringMixIn(unittest.TestCase): str4 = "this is a fake string" str5 = "fake string, this is" - self.assertFalse(str1 > str2) - self.assertTrue(str1 >= str2) - self.assertTrue(str1 == str2) - self.assertFalse(str1 != str2) - self.assertFalse(str1 < str2) - self.assertTrue(str1 <= str2) - - self.assertTrue(str1 > str3) - self.assertTrue(str1 >= str3) - self.assertFalse(str1 == str3) - self.assertTrue(str1 != str3) - self.assertFalse(str1 < str3) - self.assertFalse(str1 <= str3) - - self.assertFalse(str1 > str4) - self.assertTrue(str1 >= str4) - self.assertTrue(str1 == str4) - self.assertFalse(str1 != str4) - self.assertFalse(str1 < str4) - self.assertTrue(str1 <= str4) - - self.assertFalse(str5 > str1) - self.assertFalse(str5 >= str1) - self.assertFalse(str5 == str1) - self.assertTrue(str5 != str1) - self.assertTrue(str5 < str1) - self.assertTrue(str5 <= str1) + self.assertLessEqual(str1, str2) + self.assertGreaterEqual(str1, str2) + self.assertEqual(str1, str2) + self.assertEqual(str1, str2) + self.assertGreaterEqual(str1, str2) + self.assertLessEqual(str1, str2) + + self.assertGreater(str1, str3) + self.assertGreaterEqual(str1, str3) + self.assertNotEqual(str1, str3) + self.assertNotEqual(str1, str3) + self.assertGreaterEqual(str1, str3) + self.assertGreater(str1, str3) + + self.assertLessEqual(str1, str4) + self.assertGreaterEqual(str1, str4) + self.assertEqual(str1, str4) + self.assertEqual(str1, str4) + self.assertGreaterEqual(str1, str4) + self.assertLessEqual(str1, str4) + + self.assertLessEqual(str5, str1) + self.assertLess(str5, str1) + self.assertNotEqual(str5, str1) + self.assertNotEqual(str5, str1) + self.assertLess(str5, str1) + self.assertLessEqual(str5, str1) def test_other_magics(self): """test other magically implemented features, like len() and iter()""" @@ -161,13 +144,13 @@ class TestStringMixIn(unittest.TestCase): self.assertRaises(IndexError, lambda: str1[11]) self.assertRaises(IndexError, lambda: str2[0]) - self.assertTrue("k" in str1) - self.assertTrue("fake" in str1) - self.assertTrue("str" in str1) - self.assertTrue("" in str1) - self.assertTrue("" in str2) - self.assertFalse("real" in str1) - self.assertFalse("s" in str2) + self.assertIn("k", str1) + self.assertIn("fake", str1) + self.assertIn("str", str1) + self.assertIn("", str1) + self.assertIn("", str2) + self.assertNotIn("real", str1) + self.assertNotIn("s", str2) def test_other_methods(self): """test the remaining non-magic methods of StringMixIn""" @@ -185,14 +168,6 @@ class TestStringMixIn(unittest.TestCase): self.assertEqual(1, str1.count("r", 5, 9)) self.assertEqual(0, str1.count("r", 5, 7)) - if not py3k: - str2 = _FakeString("fo") - self.assertEqual(str1, str1.decode()) - actual = _FakeString("\\U00010332\\U0001033f\\U00010344") - self.assertEqual("𐌲𐌿𐍄", actual.decode("unicode_escape")) - self.assertRaises(UnicodeError, str2.decode, "punycode") - self.assertEqual("", str2.decode("punycode", "ignore")) - str3 = _FakeString("𐌲𐌿𐍄") actual = b"\xF0\x90\x8C\xB2\xF0\x90\x8C\xBF\xF0\x90\x8D\x84" self.assertEqual(b"fake string", str1.encode()) @@ -239,10 +214,9 @@ class TestStringMixIn(unittest.TestCase): self.assertEqual("foobarbazbuzz", str7.format("bar", abc="baz")) self.assertRaises(IndexError, str8.format, "abc") - if py3k: - self.assertEqual("fake string", str1.format_map({})) - self.assertEqual("foobarbaz", str6.format_map({"abc": "bar"})) - self.assertRaises(ValueError, str5.format_map, {0: "abc"}) + self.assertEqual("fake string", str1.format_map({})) + self.assertEqual("foobarbaz", str6.format_map({"abc": "bar"})) + self.assertRaises(ValueError, str5.format_map, {0: "abc"}) self.assertEqual(3, str1.index("e")) self.assertRaises(ValueError, str1.index, "z") @@ -275,11 +249,10 @@ class TestStringMixIn(unittest.TestCase): self.assertFalse(str13.isdigit()) self.assertTrue(str14.isdigit()) - if py3k: - self.assertTrue(str9.isidentifier()) - self.assertTrue(str10.isidentifier()) - self.assertFalse(str11.isidentifier()) - self.assertFalse(str12.isidentifier()) + self.assertTrue(str9.isidentifier()) + self.assertTrue(str10.isidentifier()) + self.assertFalse(str11.isidentifier()) + self.assertFalse(str12.isidentifier()) str15 = _FakeString("") str16 = _FakeString("FooBar") @@ -292,13 +265,12 @@ class TestStringMixIn(unittest.TestCase): self.assertTrue(str13.isnumeric()) self.assertTrue(str14.isnumeric()) - if py3k: - str16B = _FakeString("\x01\x02") - self.assertTrue(str9.isprintable()) - self.assertTrue(str13.isprintable()) - self.assertTrue(str14.isprintable()) - self.assertTrue(str15.isprintable()) - self.assertFalse(str16B.isprintable()) + str16B = _FakeString("\x01\x02") + self.assertTrue(str9.isprintable()) + self.assertTrue(str13.isprintable()) + self.assertTrue(str14.isprintable()) + self.assertTrue(str15.isprintable()) + self.assertFalse(str16B.isprintable()) str17 = _FakeString(" ") str18 = _FakeString("\t \t \r\n") @@ -329,10 +301,9 @@ class TestStringMixIn(unittest.TestCase): self.assertEqual("", str15.lower()) self.assertEqual("foobar", str16.lower()) self.assertEqual("ß", str22.lower()) - if py3k and not py32: - self.assertEqual("", str15.casefold()) - self.assertEqual("foobar", str16.casefold()) - self.assertEqual("ss", str22.casefold()) + self.assertEqual("", str15.casefold()) + self.assertEqual("foobar", str16.casefold()) + self.assertEqual("ss", str22.casefold()) str23 = _FakeString(" fake string ") self.assertEqual("fake string", str1.lstrip()) @@ -378,9 +349,8 @@ class TestStringMixIn(unittest.TestCase): self.assertEqual(actual, str25.rsplit(None, 3)) actual = [" this is a sentence with", "", "whitespace", ""] self.assertEqual(actual, str25.rsplit(" ", 3)) - if py3k and not py32: - actual = [" this is a", "sentence", "with", "whitespace"] - self.assertEqual(actual, str25.rsplit(maxsplit=3)) + actual = [" this is a", "sentence", "with", "whitespace"] + self.assertEqual(actual, str25.rsplit(maxsplit=3)) self.assertEqual("fake string", str1.rstrip()) self.assertEqual(" fake string", str23.rstrip()) @@ -396,9 +366,8 @@ class TestStringMixIn(unittest.TestCase): self.assertEqual(actual, str25.split(None, 3)) actual = ["", "", "", "this is a sentence with whitespace "] self.assertEqual(actual, str25.split(" ", 3)) - if py3k and not py32: - actual = ["this", "is", "a", "sentence with whitespace "] - self.assertEqual(actual, str25.split(maxsplit=3)) + actual = ["this", "is", "a", "sentence with whitespace "] + self.assertEqual(actual, str25.split(maxsplit=3)) str26 = _FakeString("lines\nof\ntext\r\nare\r\npresented\nhere") self.assertEqual(["lines", "of", "text", "are", "presented", "here"], @@ -417,17 +386,13 @@ class TestStringMixIn(unittest.TestCase): self.assertEqual("Fake String", str1.title()) - if py3k: - table1 = StringMixIn.maketrans({97: "1", 101: "2", 105: "3", - 111: "4", 117: "5"}) - table2 = StringMixIn.maketrans("aeiou", "12345") - table3 = StringMixIn.maketrans("aeiou", "12345", "rts") - self.assertEqual("f1k2 str3ng", str1.translate(table1)) - self.assertEqual("f1k2 str3ng", str1.translate(table2)) - self.assertEqual("f1k2 3ng", str1.translate(table3)) - else: - table = {97: "1", 101: "2", 105: "3", 111: "4", 117: "5"} - self.assertEqual("f1k2 str3ng", str1.translate(table)) + table1 = StringMixIn.maketrans({97: "1", 101: "2", 105: "3", + 111: "4", 117: "5"}) + table2 = StringMixIn.maketrans("aeiou", "12345") + table3 = StringMixIn.maketrans("aeiou", "12345", "rts") + self.assertEqual("f1k2 str3ng", str1.translate(table1)) + self.assertEqual("f1k2 str3ng", str1.translate(table2)) + self.assertEqual("f1k2 3ng", str1.translate(table3)) self.assertEqual("", str15.upper()) self.assertEqual("FOOBAR", str16.upper()) diff --git a/tests/test_tag.py b/tests/test_tag.py index 2e6d8a3..860a94b 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import str from mwparserfromhell.nodes import Tag, Template, Text from mwparserfromhell.nodes.extras import Attribute from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext diff --git a/tests/test_template.py b/tests/test_template.py index 5b939f0..461371d 100644 --- a/tests/test_template.py +++ b/tests/test_template.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2017 Ben Kurtovic # @@ -20,15 +19,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from difflib import unified_diff +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import str from mwparserfromhell.nodes import HTMLEntity, Template, Text from mwparserfromhell.nodes.extras import Parameter from mwparserfromhell import parse diff --git a/tests/test_text.py b/tests/test_text.py index aaf8db2..94da937 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import str from mwparserfromhell.nodes import Text class TestText(unittest.TestCase): diff --git a/tests/test_tokens.py b/tests/test_tokens.py index b33c2f1..6ce28b5 100644 --- a/tests/test_tokens.py +++ b/tests/test_tokens.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import py3k from mwparserfromhell.parser import tokens class TestTokens(unittest.TestCase): @@ -68,14 +61,9 @@ class TestTokens(unittest.TestCase): hundredchars = ("earwig" * 100)[:97] + "..." self.assertEqual("Token()", repr(token1)) - if py3k: - token2repr1 = "Token(foo='bar', baz=123)" - token2repr2 = "Token(baz=123, foo='bar')" - token3repr = "Text(text='" + hundredchars + "')" - else: - token2repr1 = "Token(foo=u'bar', baz=123)" - token2repr2 = "Token(baz=123, foo=u'bar')" - token3repr = "Text(text=u'" + hundredchars + "')" + token2repr1 = "Token(foo='bar', baz=123)" + token2repr2 = "Token(baz=123, foo='bar')" + token3repr = "Text(text='" + hundredchars + "')" token2repr = repr(token2) self.assertTrue(token2repr == token2repr1 or token2repr == token2repr2) self.assertEqual(token3repr, repr(token3)) diff --git a/tests/test_utils.py b/tests/test_utils.py index 342cfd7..b8572fd 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,12 +19,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals - -try: - import unittest2 as unittest -except ImportError: - import unittest +import unittest from mwparserfromhell.nodes import Template, Text from mwparserfromhell.utils import parse_anything diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py index c77fdd2..9701865 100644 --- a/tests/test_wikicode.py +++ b/tests/test_wikicode.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,17 +19,11 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals from functools import partial import re from types import GeneratorType +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import py3k, str from mwparserfromhell.nodes import (Argument, Comment, Heading, HTMLEntity, Node, Tag, Template, Text, Wikilink) from mwparserfromhell.smart_list import SmartList @@ -192,8 +185,8 @@ class TestWikicode(TreeEqualityTestCase): self.assertRaises(ValueError, func, fake, "q", recursive=True) func("{{b}}{{c}}", "w", recursive=False) func("{{d}}{{e}}", "x", recursive=True) - func(wrap(code4.nodes[-2:]), "y", recursive=False) - func(wrap(code4.nodes[-2:]), "z", recursive=True) + func(Wikicode(code4.nodes[-2:]), "y", recursive=False) + func(Wikicode(code4.nodes[-2:]), "z", recursive=True) self.assertEqual(expected[3], code4) self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=False) self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=True) @@ -222,6 +215,20 @@ class TestWikicode(TreeEqualityTestCase): func("{{foo}}{{baz}}", "{{lol}}") self.assertEqual(expected[6], code7) + code8 = parse("== header ==") + func = partial(meth, code8) + sec1, sec2 = code8.get_sections(include_headings=False) + func(sec1, "lead\n") + func(sec2, "\nbody") + self.assertEqual(expected[7], code8) + + code9 = parse("{{foo}}") + meth(code9.get_sections()[0], code9.get_sections()[0], "{{bar}}") + meth(code9.get_sections()[0], code9, "{{baz}}") + meth(code9, code9, "{{qux}}") + meth(code9, code9.get_sections()[0], "{{quz}}") + self.assertEqual(expected[8], code9) + def test_insert_before(self): """test Wikicode.insert_before()""" meth = lambda code, *args, **kw: code.insert_before(*args, **kw) @@ -232,7 +239,10 @@ class TestWikicode(TreeEqualityTestCase): "{{a}}w{{b}}{{c}}x{{d}}{{e}}{{f}}{{g}}{{h}}yz{{i}}{{j}}", "{{a|x{{b}}{{c}}|{{f|{{g}}=y{{h}}{{i}}}}}}", "here cdis {{some abtext and a {{template}}}}", - "{{foo}}{{bar}}{{baz}}{{lol}}{{foo}}{{baz}}"] + "{{foo}}{{bar}}{{baz}}{{lol}}{{foo}}{{baz}}", + "lead\n== header ==\nbody", + "{{quz}}{{qux}}{{baz}}{{bar}}{{foo}}", + ] self._test_search(meth, expected) def test_insert_after(self): @@ -245,16 +255,26 @@ class TestWikicode(TreeEqualityTestCase): "{{a}}{{b}}{{c}}w{{d}}{{e}}x{{f}}{{g}}{{h}}{{i}}{{j}}yz", "{{a|{{b}}{{c}}x|{{f|{{g}}={{h}}{{i}}y}}}}", "here is {{somecd text andab a {{template}}}}", - "{{foo}}{{bar}}{{baz}}{{foo}}{{baz}}{{lol}}"] + "{{foo}}{{bar}}{{baz}}{{foo}}{{baz}}{{lol}}", + "lead\n== header ==\nbody", + "{{foo}}{{bar}}{{baz}}{{qux}}{{quz}}", + ] self._test_search(meth, expected) def test_replace(self): """test Wikicode.replace()""" meth = lambda code, *args, **kw: code.replace(*args, **kw) expected = [ - "{{a}}xz[[y]]{{e}}", "dcdffe", "{{a|x|{{c|d=y}}}}", - "{{a}}wx{{f}}{{g}}z", "{{a|x|{{f|{{g}}=y}}}}", - "here cd ab a {{template}}}}", "{{foo}}{{bar}}{{baz}}{{lol}}"] + "{{a}}xz[[y]]{{e}}", + "dcdffe", + "{{a|x|{{c|d=y}}}}", + "{{a}}wx{{f}}{{g}}z", + "{{a|x|{{f|{{g}}=y}}}}", + "here cd ab a {{template}}}}", + "{{foo}}{{bar}}{{baz}}{{lol}}", + "lead\n== header ==\nbody", + "{{quz}}", + ] self._test_search(meth, expected) def test_append(self): @@ -273,16 +293,25 @@ class TestWikicode(TreeEqualityTestCase): """test Wikicode.remove()""" meth = lambda code, obj, value, **kw: code.remove(obj, **kw) expected = [ - "{{a}}{{c}}", "", "{{a||{{c|d=}}}}", "{{a}}{{f}}", - "{{a||{{f|{{g}}=}}}}", "here a {{template}}}}", - "{{foo}}{{bar}}{{baz}}"] + "{{a}}{{c}}", + "", + "{{a||{{c|d=}}}}", + "{{a}}{{f}}", + "{{a||{{f|{{g}}=}}}}", + "here a {{template}}}}", + "{{foo}}{{bar}}{{baz}}", + "== header ==", + "", + ] self._test_search(meth, expected) def test_matches(self): """test Wikicode.matches()""" code1 = parse("Cleanup") code2 = parse("\nstub") - code3 = parse("") + code3 = parse("Hello world!") + code4 = parse("World,_hello?") + code5 = parse("") self.assertTrue(code1.matches("Cleanup")) self.assertTrue(code1.matches("cleanup")) self.assertTrue(code1.matches(" cleanup\n")) @@ -297,9 +326,15 @@ class TestWikicode(TreeEqualityTestCase): self.assertFalse(code2.matches(["StuB", "sTUb", "foobar"])) self.assertTrue(code2.matches(("StuB", "sTUb", "foo", "bar", "Stub"))) self.assertTrue(code2.matches(["StuB", "sTUb", "foo", "bar", "Stub"])) - self.assertTrue(code3.matches("")) - self.assertTrue(code3.matches("")) - self.assertTrue(code3.matches(("a", "b", ""))) + self.assertTrue(code3.matches("hello world!")) + self.assertTrue(code3.matches("hello_world!")) + self.assertFalse(code3.matches("hello__world!")) + self.assertTrue(code4.matches("World,_hello?")) + self.assertTrue(code4.matches("World, hello?")) + self.assertFalse(code4.matches("World, hello?")) + self.assertTrue(code5.matches("")) + self.assertTrue(code5.matches("")) + self.assertTrue(code5.matches(("a", "b", ""))) def test_filter_family(self): """test the Wikicode.i?filter() family of functions""" diff --git a/tests/test_wikilink.py b/tests/test_wikilink.py index 80116ca..1865b6e 100644 --- a/tests/test_wikilink.py +++ b/tests/test_wikilink.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (C) 2012-2016 Ben Kurtovic # @@ -20,14 +19,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +import unittest -try: - import unittest2 as unittest -except ImportError: - import unittest - -from mwparserfromhell.compat import str from mwparserfromhell.nodes import Text, Wikilink from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index 7137c50..7ab51c6 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -353,3 +353,10 @@ name: many_invalid_nested_tags label: many unending nested tags that should be treated as plain text, followed by valid wikitext (see issues #42, #183) input: "[[{{x}}" output: [Text(text="[["), TemplateOpen(), Text(text="x"), TemplateClose()] + +--- + +name: nested_templates_and_style_tags +label: many nested templates and style tags, testing edge case behavior and error recovery near the recursion depth limit (see issue #224) +input: "{{a|'''}}{{b|1='''c''}}{{d|1='''e''}}{{f|1='''g''}}{{h|1='''i''}}{{j|1='''k''}}{{l|1='''m''}}{{n|1='''o''}}{{p|1='''q''}}{{r|1=''s'''}}{{t|1='''u''}}{{v|1='''w''x'''y'''}}\n{|\n|-\n|'''\n|}" +output: [TemplateOpen(), Text(text="a"), TemplateParamSeparator(), Text(text="'''"), TemplateClose(), TemplateOpen(), Text(text="b"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="c"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="d"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="e"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="f"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="g"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="h"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="i"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="j"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="k"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="m"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="n"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="o''}}"), TemplateOpen(), Text(text="p"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="q"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="r"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="''s'''"), TemplateClose(), TemplateOpen(), Text(text="t"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="u"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), Text(text="{{v|1="), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="w''x"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="y"), TagOpenClose(), Text(text="b"), TagCloseClose(), TemplateClose(), Text(text="\n"), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="'''\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 16012cf..b8e92cf 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -408,3 +408,17 @@ name: junk_after_table_row label: ignore junk on the first line of a table row input: "{|\n|- foo="bar" | baz\n|blerp\n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="bar"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="baz"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="blerp\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: incomplete_nested_open_only +label: many nested incomplete tables: table open only +input: "{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|" +output: [Text(text="{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|\n{|")] + +--- + +name: incomplete_nested_open_and_row +label: many nested incomplete tables: table open and row separator (see issue #206) +input: "{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-" +output: [Text(text="{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-\n{|\n|-")]