Browse Source

Drop Python 2 support

Fixes #221.
pull/234/head
Kunal Mehta 8 months ago
parent
commit
7e5297fbe6
71 changed files with 189 additions and 657 deletions
  1. +0
    -1
      .travis.yml
  2. +2
    -4
      README.rst
  3. +0
    -8
      appveyor.yml
  4. +1
    -1
      docs/index.rst
  5. +1
    -3
      docs/usage.rst
  6. +1
    -2
      mwparserfromhell/__init__.py
  7. +0
    -27
      mwparserfromhell/compat.py
  8. +0
    -2
      mwparserfromhell/definitions.py
  9. +0
    -3
      mwparserfromhell/nodes/__init__.py
  10. +1
    -4
      mwparserfromhell/nodes/argument.py
  11. +1
    -4
      mwparserfromhell/nodes/comment.py
  12. +1
    -4
      mwparserfromhell/nodes/external_link.py
  13. +0
    -1
      mwparserfromhell/nodes/extras/__init__.py
  14. +1
    -4
      mwparserfromhell/nodes/extras/attribute.py
  15. +1
    -4
      mwparserfromhell/nodes/extras/parameter.py
  16. +1
    -4
      mwparserfromhell/nodes/heading.py
  17. +5
    -34
      mwparserfromhell/nodes/html_entity.py
  18. +1
    -4
      mwparserfromhell/nodes/tag.py
  19. +1
    -4
      mwparserfromhell/nodes/template.py
  20. +1
    -4
      mwparserfromhell/nodes/text.py
  21. +1
    -4
      mwparserfromhell/nodes/wikilink.py
  22. +2
    -3
      mwparserfromhell/parser/__init__.py
  23. +1
    -4
      mwparserfromhell/parser/builder.py
  24. +0
    -1
      mwparserfromhell/parser/contexts.py
  25. +1
    -25
      mwparserfromhell/parser/ctokenizer/common.h
  26. +1
    -1
      mwparserfromhell/parser/ctokenizer/tag_data.h
  27. +6
    -49
      mwparserfromhell/parser/ctokenizer/textbuffer.c
  28. +2
    -2
      mwparserfromhell/parser/ctokenizer/textbuffer.h
  29. +26
    -30
      mwparserfromhell/parser/ctokenizer/tok_parse.c
  30. +1
    -1
      mwparserfromhell/parser/ctokenizer/tok_parse.h
  31. +4
    -8
      mwparserfromhell/parser/ctokenizer/tok_support.c
  32. +3
    -3
      mwparserfromhell/parser/ctokenizer/tok_support.h
  33. +9
    -25
      mwparserfromhell/parser/ctokenizer/tokenizer.c
  34. +0
    -18
      mwparserfromhell/parser/ctokenizer/tokenizer.h
  35. +4
    -6
      mwparserfromhell/parser/tokenizer.py
  36. +1
    -5
      mwparserfromhell/parser/tokens.py
  37. +13
    -43
      mwparserfromhell/smart_list/ListProxy.py
  38. +19
    -47
      mwparserfromhell/smart_list/SmartList.py
  39. +0
    -1
      mwparserfromhell/smart_list/__init__.py
  40. +1
    -5
      mwparserfromhell/smart_list/utils.py
  41. +11
    -25
      mwparserfromhell/string_mixin.py
  42. +0
    -3
      mwparserfromhell/utils.py
  43. +3
    -8
      mwparserfromhell/wikicode.py
  44. +0
    -3
      scripts/memtest.py
  45. +3
    -9
      setup.py
  46. +0
    -1
      tests/__init__.py
  47. +1
    -6
      tests/_test_tokenizer.py
  48. +0
    -3
      tests/_test_tree_equality.py
  49. +0
    -18
      tests/compat.py
  50. +0
    -3
      tests/test_argument.py
  51. +0
    -3
      tests/test_attribute.py
  52. +1
    -5
      tests/test_builder.py
  53. +0
    -3
      tests/test_comment.py
  54. +0
    -2
      tests/test_ctokenizer.py
  55. +11
    -29
      tests/test_docs.py
  56. +0
    -3
      tests/test_external_link.py
  57. +0
    -3
      tests/test_heading.py
  58. +0
    -3
      tests/test_html_entity.py
  59. +0
    -3
      tests/test_parameter.py
  60. +0
    -3
      tests/test_parser.py
  61. +0
    -2
      tests/test_pytokenizer.py
  62. +0
    -2
      tests/test_roundtripping.py
  63. +3
    -18
      tests/test_smart_list.py
  64. +39
    -68
      tests/test_string_mixin.py
  65. +0
    -3
      tests/test_tag.py
  66. +0
    -3
      tests/test_template.py
  67. +0
    -3
      tests/test_text.py
  68. +3
    -11
      tests/test_tokens.py
  69. +0
    -2
      tests/test_utils.py
  70. +0
    -3
      tests/test_wikicode.py
  71. +0
    -3
      tests/test_wikilink.py

+ 0
- 1
.travis.yml View File

@@ -1,7 +1,6 @@
dist: xenial
language: python
python:
- 2.7
- 3.4
- 3.5
- 3.6

+ 2
- 4
README.rst View File

@@ -11,7 +11,7 @@ mwparserfromhell

**mwparserfromhell** (the *MediaWiki Parser from Hell*) is a Python package
that provides an easy-to-use and outrageously powerful parser for MediaWiki_
wikicode. It supports Python 2 and Python 3.
wikicode. It supports Python 3.4+.

Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others.
Full documentation is available on ReadTheDocs_. Development occurs on GitHub_.
@@ -41,7 +41,7 @@ Normal usage is rather straightforward (where ``text`` is page text):
>>> wikicode = mwparserfromhell.parse(text)

``wikicode`` is a ``mwparserfromhell.Wikicode`` object, which acts like an
ordinary ``str`` object (or ``unicode`` in Python 2) with some extra methods.
ordinary ``str`` object with some extra methods.
For example:

>>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?"
@@ -111,8 +111,6 @@ saving the page!) by calling ``str()`` on it:
>>> text == code
True

Likewise, use ``unicode(code)`` in Python 2.

Limitations
-----------


+ 0
- 8
appveyor.yml View File

@@ -22,14 +22,6 @@ environment:
secure: gOIcvPxSC2ujuhwOzwj3v8xjq3CCYd8keFWVnguLM+gcL0e02qshDHy7gwZZwj0+

matrix:
- PYTHON: "C:\\Python27"
PYTHON_VERSION: "2.7"
PYTHON_ARCH: "32"

- PYTHON: "C:\\Python27-x64"
PYTHON_VERSION: "2.7"
PYTHON_ARCH: "64"

- PYTHON: "C:\\Python34"
PYTHON_VERSION: "3.4"
PYTHON_ARCH: "32"

+ 1
- 1
docs/index.rst View File

@@ -3,7 +3,7 @@ MWParserFromHell v\ |version| Documentation

:mod:`mwparserfromhell` (the *MediaWiki Parser from Hell*) is a Python package
that provides an easy-to-use and outrageously powerful parser for MediaWiki_
wikicode. It supports Python 2 and Python 3.
wikicode. It supports Python 3.4+.

Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others.
Development occurs on GitHub_.

+ 1
- 3
docs/usage.rst View File

@@ -7,8 +7,7 @@ Normal usage is rather straightforward (where ``text`` is page text)::
>>> wikicode = mwparserfromhell.parse(text)

``wikicode`` is a :class:`mwparserfromhell.Wikicode <.Wikicode>` object, which
acts like an ordinary ``str`` object (or ``unicode`` in Python 2) with some
extra methods. For example::
acts like an ordinary ``str`` object with some extra methods. For example::

>>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?"
>>> wikicode = mwparserfromhell.parse(text)
@@ -78,7 +77,6 @@ saving the page!) by calling :func:`str` on it::
>>> text == code
True

(Likewise, use :func:`unicode(code) <unicode>` in Python 2.)

For more tips, check out :class:`Wikicode's full method list <.Wikicode>` and
the :mod:`list of Nodes <.nodes>`.

+ 1
- 2
mwparserfromhell/__init__.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -32,7 +31,7 @@ __license__ = "MIT License"
__version__ = "0.6.dev0"
__email__ = "ben.kurtovic@gmail.com"

from . import (compat, definitions, nodes, parser, smart_list, string_mixin,
from . import (definitions, nodes, parser, smart_list, string_mixin,
utils, wikicode)

parse = utils.parse_anything

+ 0
- 27
mwparserfromhell/compat.py View File

@@ -1,27 +0,0 @@
# -*- coding: utf-8 -*-

"""
Implements support for both Python 2 and Python 3 by defining common types in
terms of their Python 2/3 variants. For example, :class:`str` is set to
:class:`unicode` on Python 2 but :class:`str` on Python 3; likewise,
:class:`bytes` is :class:`str` on 2 but :class:`bytes` on 3. These types are
meant to be imported directly from within the parser's modules.
"""

import sys

py3k = (sys.version_info[0] == 3)

if py3k:
bytes = bytes
str = str
range = range
import html.entities as htmlentities

else:
bytes = str
str = unicode
range = xrange
import htmlentitydefs as htmlentities

del sys

+ 0
- 2
mwparserfromhell/definitions.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -28,7 +27,6 @@ When updating this file, please also update the the C tokenizer version:
- mwparserfromhell/parser/ctokenizer/definitions.h
"""

from __future__ import unicode_literals

__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
"is_single_only", "is_scheme"]

+ 0
- 3
mwparserfromhell/nodes/__init__.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -29,9 +28,7 @@ the name of a :class:`.Template` is a :class:`.Wikicode` object that can
contain text or more templates.
"""

from __future__ import unicode_literals

from ..compat import str
from ..string_mixin import StringMixIn

__all__ = ["Argument", "Comment", "ExternalLink", "HTMLEntity", "Heading",

+ 1
- 4
mwparserfromhell/nodes/argument.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from . import Node
from ..compat import str
from ..utils import parse_anything

__all__ = ["Argument"]
@@ -32,7 +29,7 @@ class Argument(Node):
"""Represents a template argument substitution, like ``{{{foo}}}``."""

def __init__(self, name, default=None):
super(Argument, self).__init__()
super().__init__()
self.name = name
self.default = default


+ 1
- 4
mwparserfromhell/nodes/comment.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from . import Node
from ..compat import str

__all__ = ["Comment"]

@@ -31,7 +28,7 @@ class Comment(Node):
"""Represents a hidden HTML comment, like ``<!-- foobar -->``."""

def __init__(self, contents):
super(Comment, self).__init__()
super().__init__()
self.contents = contents

def __unicode__(self):

+ 1
- 4
mwparserfromhell/nodes/external_link.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from . import Node
from ..compat import str
from ..utils import parse_anything

__all__ = ["ExternalLink"]
@@ -32,7 +29,7 @@ class ExternalLink(Node):
"""Represents an external link, like ``[http://example.com/ Example]``."""

def __init__(self, url, title=None, brackets=True):
super(ExternalLink, self).__init__()
super().__init__()
self.url = url
self.title = title
self.brackets = brackets

+ 0
- 1
mwparserfromhell/nodes/extras/__init__.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#

+ 1
- 4
mwparserfromhell/nodes/extras/attribute.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,9 +19,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from ...compat import str
from ...string_mixin import StringMixIn
from ...utils import parse_anything

@@ -38,7 +35,7 @@ class Attribute(StringMixIn):

def __init__(self, name, value=None, quotes='"', pad_first=" ",
pad_before_eq="", pad_after_eq=""):
super(Attribute, self).__init__()
super().__init__()
self.name = name
self._quotes = None
self.value = value

+ 1
- 4
mwparserfromhell/nodes/extras/parameter.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import re

from ...compat import str
from ...string_mixin import StringMixIn
from ...utils import parse_anything

@@ -39,7 +36,7 @@ class Parameter(StringMixIn):
"""

def __init__(self, name, value, showkey=True):
super(Parameter, self).__init__()
super().__init__()
self.name = name
self.value = value
self.showkey = showkey

+ 1
- 4
mwparserfromhell/nodes/heading.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from . import Node
from ..compat import str
from ..utils import parse_anything

__all__ = ["Heading"]
@@ -32,7 +29,7 @@ class Heading(Node):
"""Represents a section heading in wikicode, like ``== Foo ==``."""

def __init__(self, title, level):
super(Heading, self).__init__()
super().__init__()
self.title = title
self.level = level


+ 5
- 34
mwparserfromhell/nodes/html_entity.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import html.entities as htmlentities

from . import Node
from ..compat import htmlentities, py3k, str

__all__ = ["HTMLEntity"]

@@ -31,7 +29,7 @@ class HTMLEntity(Node):
"""Represents an HTML entity, like ``&nbsp;``, either named or unnamed."""

def __init__(self, value, named=None, hexadecimal=False, hex_char="x"):
super(HTMLEntity, self).__init__()
super().__init__()
self._value = value
if named is None: # Try to guess whether or not the entity is named
try:
@@ -63,32 +61,6 @@ class HTMLEntity(Node):
return self.normalize()
return self

if not py3k:
@staticmethod
def _unichr(value):
"""Implement builtin unichr() with support for non-BMP code points.

On wide Python builds, this functions like the normal unichr(). On
narrow builds, this returns the value's encoded surrogate pair.
"""
try:
return unichr(value)
except ValueError:
# Test whether we're on the wide or narrow Python build. Check
# the length of a non-BMP code point
# (U+1F64A, SPEAK-NO-EVIL MONKEY):
if len("\U0001F64A") == 1: # pragma: no cover
raise
# Ensure this is within the range we can encode:
if value > 0x10FFFF:
raise ValueError("unichr() arg not in range(0x110000)")
code = value - 0x10000
if value < 0: # Invalid code point
raise
lead = 0xD800 + (code >> 10)
trail = 0xDC00 + (code % (1 << 10))
return unichr(lead) + unichr(trail)

@property
def value(self):
"""The string value of the HTML entity."""
@@ -173,9 +145,8 @@ class HTMLEntity(Node):

def normalize(self):
"""Return the unicode character represented by the HTML entity."""
chrfunc = chr if py3k else HTMLEntity._unichr
if self.named:
return chrfunc(htmlentities.name2codepoint[self.value])
return chr(htmlentities.name2codepoint[self.value])
if self.hexadecimal:
return chrfunc(int(self.value, 16))
return chrfunc(int(self.value))
return chr(int(self.value, 16))
return chr(int(self.value))

+ 1
- 4
mwparserfromhell/nodes/tag.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,11 +19,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from . import Node
from .extras import Attribute
from ..compat import str
from ..definitions import is_visible
from ..utils import parse_anything

@@ -37,7 +34,7 @@ class Tag(Node):
self_closing=False, invalid=False, implicit=False, padding="",
closing_tag=None, wiki_style_separator=None,
closing_wiki_markup=None):
super(Tag, self).__init__()
super().__init__()
self.tag = tag
self.contents = contents
self._attrs = attrs if attrs else []

+ 1
- 4
mwparserfromhell/nodes/template.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,13 +19,11 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
from collections import defaultdict
import re

from . import HTMLEntity, Node, Text
from .extras import Parameter
from ..compat import range, str
from ..utils import parse_anything

__all__ = ["Template"]
@@ -37,7 +34,7 @@ class Template(Node):
"""Represents a template in wikicode, like ``{{foo}}``."""

def __init__(self, name, params=None):
super(Template, self).__init__()
super().__init__()
self.name = name
if params:
self._params = params

+ 1
- 4
mwparserfromhell/nodes/text.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from . import Node
from ..compat import str

__all__ = ["Text"]

@@ -31,7 +28,7 @@ class Text(Node):
"""Represents ordinary, unformatted text with no special properties."""

def __init__(self, value):
super(Text, self).__init__()
super().__init__()
self.value = value

def __unicode__(self):

+ 1
- 4
mwparserfromhell/nodes/wikilink.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from . import Node
from ..compat import str
from ..utils import parse_anything

__all__ = ["Wikilink"]
@@ -32,7 +29,7 @@ class Wikilink(Node):
"""Represents an internal wikilink, like ``[[Foo|Bar]]``."""

def __init__(self, title, text=None):
super(Wikilink, self).__init__()
super().__init__()
self.title = title
self.text = text


+ 2
- 3
mwparserfromhell/parser/__init__.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -36,7 +35,7 @@ class ParserError(Exception):
"""
def __init__(self, extra):
msg = "This is a bug and should be reported. Info: {}.".format(extra)
super(ParserError, self).__init__(msg)
super().__init__(msg)


from .builder import Builder
@@ -50,7 +49,7 @@ except ImportError:

__all__ = ["use_c", "Parser", "ParserError"]

class Parser(object):
class Parser:
"""Represents a parser for wikicode.

Actual parsing is a two-step process: first, the text is split up into a

+ 1
- 4
mwparserfromhell/parser/builder.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from . import tokens, ParserError
from ..compat import str
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
Template, Text, Wikilink)
from ..nodes.extras import Attribute, Parameter
@@ -45,7 +42,7 @@ def _add_handler(token_type):
return decorator


class Builder(object):
class Builder:
"""Builds a tree of nodes out of a sequence of tokens.

To use, pass a list of :class:`.Token`\\ s to the :meth:`build` method. The

+ 0
- 1
mwparserfromhell/parser/contexts.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#

+ 1
- 25
mwparserfromhell/parser/ctokenizer/common.h View File

@@ -23,7 +23,7 @@ SOFTWARE.
#pragma once

#ifndef PY_SSIZE_T_CLEAN
#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html
#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html
#endif

#include <Python.h>
@@ -34,10 +34,6 @@ SOFTWARE.

/* Compatibility macros */

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

#ifndef uint64_t
#define uint64_t unsigned PY_LONG_LONG
#endif
@@ -48,20 +44,8 @@ SOFTWARE.

/* Unicode support macros */

#if defined(IS_PY3K) && PY_MINOR_VERSION >= 3
#define PEP_393
#endif

#ifdef PEP_393
#define Unicode Py_UCS4
#define PyUnicode_FROM_SINGLE(chr) \
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)
#else
#define Unicode Py_UNICODE
#define PyUnicode_FROM_SINGLE(chr) \
PyUnicode_FromUnicode(&(chr), 1)
#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE
#endif

/* Error handling macros */

@@ -85,13 +69,9 @@ extern PyObject* definitions;
typedef struct {
Py_ssize_t capacity;
Py_ssize_t length;
#ifdef PEP_393
PyObject* object;
int kind;
void* data;
#else
Py_UNICODE* data;
#endif
} Textbuffer;

typedef struct {
@@ -111,12 +91,8 @@ typedef struct Stack Stack;
typedef struct {
PyObject* object; /* base PyUnicodeObject object */
Py_ssize_t length; /* length of object, in code points */
#ifdef PEP_393
int kind; /* object's kind value */
void* data; /* object's raw unicode buffer */
#else
Py_UNICODE* buf; /* object's internal buffer */
#endif
} TokenizerInput;

typedef struct avl_tree_node avl_tree;

+ 1
- 1
mwparserfromhell/parser/ctokenizer/tag_data.h View File

@@ -32,7 +32,7 @@ typedef struct {
Textbuffer* pad_first;
Textbuffer* pad_before_eq;
Textbuffer* pad_after_eq;
Unicode quoter;
Py_UCS4 quoter;
Py_ssize_t reset;
} TagData;


+ 6
- 49
mwparserfromhell/parser/ctokenizer/textbuffer.c View File

@@ -29,23 +29,16 @@ SOFTWARE.
/*
Internal allocation function for textbuffers.
*/
static int internal_alloc(Textbuffer* self, Unicode maxchar)
static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar)
{
self->capacity = INITIAL_CAPACITY;
self->length = 0;

#ifdef PEP_393
self->object = PyUnicode_New(self->capacity, maxchar);
if (!self->object)
return -1;
self->kind = PyUnicode_KIND(self->object);
self->data = PyUnicode_DATA(self->object);
#else
(void) maxchar; // Unused
self->data = malloc(sizeof(Unicode) * self->capacity);
if (!self->data)
return -1;
#endif

return 0;
}
@@ -55,11 +48,7 @@ static int internal_alloc(Textbuffer* self, Unicode maxchar)
*/
static void internal_dealloc(Textbuffer* self)
{
#ifdef PEP_393
Py_DECREF(self->object);
#else
free(self->data);
#endif
}

/*
@@ -67,7 +56,6 @@ static void internal_dealloc(Textbuffer* self)
*/
static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
{
#ifdef PEP_393
PyObject *newobj;
void *newdata;

@@ -79,10 +67,6 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
Py_DECREF(self->object);
self->object = newobj;
self->data = newdata;
#else
if (!(self->data = realloc(self->data, sizeof(Unicode) * new_cap)))
return -1;
#endif

self->capacity = new_cap;
return 0;
@@ -94,11 +78,9 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
Textbuffer* Textbuffer_new(TokenizerInput* text)
{
Textbuffer* self = malloc(sizeof(Textbuffer));
Unicode maxchar = 0;
Py_UCS4 maxchar = 0;

#ifdef PEP_393
maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);
#endif

if (!self)
goto fail_nomem;
@@ -127,11 +109,9 @@ void Textbuffer_dealloc(Textbuffer* self)
*/
int Textbuffer_reset(Textbuffer* self)
{
Unicode maxchar = 0;
Py_UCS4 maxchar = 0;

#ifdef PEP_393
maxchar = PyUnicode_MAX_CHAR_VALUE(self->object);
#endif

internal_dealloc(self);
if (internal_alloc(self, maxchar))
@@ -142,18 +122,14 @@ int Textbuffer_reset(Textbuffer* self)
/*
Write a Unicode codepoint to the given textbuffer.
*/
int Textbuffer_write(Textbuffer* self, Unicode code)
int Textbuffer_write(Textbuffer* self, Py_UCS4 code)
{
if (self->length >= self->capacity) {
if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0)
return -1;
}

#ifdef PEP_393
PyUnicode_WRITE(self->kind, self->data, self->length++, code);
#else
self->data[self->length++] = code;
#endif

return 0;
}
@@ -163,13 +139,9 @@ int Textbuffer_write(Textbuffer* self, Unicode code)

This function does not check for bounds.
*/
Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index)
Py_UCS4 Textbuffer_read(Textbuffer* self, Py_ssize_t index)
{
#ifdef PEP_393
return PyUnicode_READ(self->kind, self->data, index);
#else
return self->data[index];
#endif
}

/*
@@ -177,11 +149,7 @@ Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index)
*/
PyObject* Textbuffer_render(Textbuffer* self)
{
#ifdef PEP_393
return PyUnicode_FromKindAndData(self->kind, self->data, self->length);
#else
return PyUnicode_FromUnicode(self->data, self->length);
#endif
}

/*
@@ -196,14 +164,9 @@ int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
return -1;
}

#ifdef PEP_393
assert(self->kind == other->kind);
memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data,
other->length * other->kind);
#else
memcpy(self->data + self->length, other->data,
other->length * sizeof(Unicode));
#endif

self->length = newlen;
return 0;
@@ -215,18 +178,12 @@ int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
void Textbuffer_reverse(Textbuffer* self)
{
Py_ssize_t i, end = self->length - 1;
Unicode tmp;
Py_UCS4 tmp;

for (i = 0; i < self->length / 2; i++) {
#ifdef PEP_393
tmp = PyUnicode_READ(self->kind, self->data, i);
PyUnicode_WRITE(self->kind, self->data, i,
PyUnicode_READ(self->kind, self->data, end - i));
PyUnicode_WRITE(self->kind, self->data, end - i, tmp);
#else
tmp = self->data[i];
self->data[i] = self->data[end - i];
self->data[end - i] = tmp;
#endif
}
}

+ 2
- 2
mwparserfromhell/parser/ctokenizer/textbuffer.h View File

@@ -29,8 +29,8 @@ SOFTWARE.
Textbuffer* Textbuffer_new(TokenizerInput*);
void Textbuffer_dealloc(Textbuffer*);
int Textbuffer_reset(Textbuffer*);
int Textbuffer_write(Textbuffer*, Unicode);
Unicode Textbuffer_read(Textbuffer*, Py_ssize_t);
int Textbuffer_write(Textbuffer*, Py_UCS4);
Py_UCS4 Textbuffer_read(Textbuffer*, Py_ssize_t);
PyObject* Textbuffer_render(Textbuffer*);
int Textbuffer_concat(Textbuffer*, Textbuffer*);
void Textbuffer_reverse(Textbuffer*);

+ 26
- 30
mwparserfromhell/parser/ctokenizer/tok_parse.c View File

@@ -52,7 +52,7 @@ static int Tokenizer_parse_tag(Tokenizer*);
/*
Determine whether the given code point is a marker.
*/
static int is_marker(Unicode this)
static int is_marker(Py_UCS4 this)
{
int i;

@@ -442,7 +442,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
static const char* valid = URISCHEME;
Textbuffer* buffer;
PyObject* scheme;
Unicode this;
Py_UCS4 this;
int slashes, i;

if (Tokenizer_check_route(self, LC_EXT_LINK_URI) < 0)
@@ -463,7 +463,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
while (1) {
if (!valid[i])
goto end_of_loop;
if (this == (Unicode) valid[i])
if (this == (Py_UCS4) valid[i])
break;
i++;
}
@@ -516,7 +516,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
static const char* valid = URISCHEME;
Textbuffer *scheme_buffer = Textbuffer_new(&self->text);
PyObject *scheme;
Unicode chunk;
Py_UCS4 chunk;
Py_ssize_t i;
int slashes, j;
uint64_t new_context;
@@ -536,7 +536,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
FAIL_ROUTE(0);
return 0;
}
} while (chunk != (Unicode) valid[j++]);
} while (chunk != (Py_UCS4) valid[j++]);
Textbuffer_write(scheme_buffer, chunk);
}
end_of_loop:
@@ -580,7 +580,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
Handle text in a free external link, including trailing punctuation.
*/
static int Tokenizer_handle_free_link_text(
Tokenizer* self, int* parens, Textbuffer* tail, Unicode this)
Tokenizer* self, int* parens, Textbuffer* tail, Py_UCS4 this)
{
#define PUSH_TAIL_BUFFER(tail, error) \
if (tail && tail->length > 0) { \
@@ -607,10 +607,10 @@ static int Tokenizer_handle_free_link_text(
Return whether the current head is the end of a free link.
*/
static int
Tokenizer_is_free_link(Tokenizer* self, Unicode this, Unicode next)
Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next)
{
// Built from Tokenizer_parse()'s end sentinels:
Unicode after = Tokenizer_read(self, 2);
Py_UCS4 after = Tokenizer_read(self, 2);
uint64_t ctx = self->topstack->context;

return (!this || this == '\n' || this == '[' || this == ']' ||
@@ -628,7 +628,7 @@ static PyObject*
Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
Textbuffer* extra)
{
Unicode this, next;
Py_UCS4 this, next;
int parens = 0;

if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) :
@@ -816,11 +816,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
if (!heading) {
return -1;
}
#ifdef IS_PY3K
level = PyLong_FromSsize_t(heading->level);
#else
level = PyInt_FromSsize_t(heading->level);
#endif
if (!level) {
Py_DECREF(heading->title);
free(heading);
@@ -933,7 +929,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
static int Tokenizer_really_parse_entity(Tokenizer* self)
{
PyObject *kwargs, *charobj, *textobj;
Unicode this;
Py_UCS4 this;
int numeric, hexadecimal, i, j, zeroes, test;
char *valid, *text, *buffer, *def;

@@ -1014,7 +1010,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
while (1) {
if (!valid[j])
FAIL_ROUTE_AND_EXIT()
if (this == (Unicode) valid[j])
if (this == (Py_UCS4) valid[j])
break;
j++;
}
@@ -1111,7 +1107,7 @@ static int Tokenizer_parse_comment(Tokenizer* self)
{
Py_ssize_t reset = self->head + 3;
PyObject *comment;
Unicode this;
Py_UCS4 this;

self->head += 4;
if (Tokenizer_push(self, 0))
@@ -1211,7 +1207,7 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data)
Handle whitespace inside of an HTML open tag.
*/
static int Tokenizer_handle_tag_space(
Tokenizer* self, TagData* data, Unicode text)
Tokenizer* self, TagData* data, Py_UCS4 text)
{
uint64_t ctx = data->context;
uint64_t end_of_value = (ctx & TAG_ATTR_VALUE &&
@@ -1243,9 +1239,9 @@ static int Tokenizer_handle_tag_space(
/*
Handle regular text inside of an HTML open tag.
*/
static int Tokenizer_handle_tag_text(Tokenizer* self, Unicode text)
static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UCS4 text)
{
Unicode next = Tokenizer_read(self, 1);
Py_UCS4 next = Tokenizer_read(self, 1);

if (!is_marker(text) || !Tokenizer_CAN_RECURSE(self))
return Tokenizer_emit_char(self, text);
@@ -1262,7 +1258,7 @@ static int Tokenizer_handle_tag_text(Tokenizer* self, Unicode text)
Handle all sorts of text data inside of an HTML open tag.
*/
static int Tokenizer_handle_tag_data(
Tokenizer* self, TagData* data, Unicode chunk)
Tokenizer* self, TagData* data, Py_UCS4 chunk)
{
PyObject *trash;
int first_time, escaped;
@@ -1444,7 +1440,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
{
Textbuffer* buffer;
PyObject *buf_tmp, *end_tag, *start_tag;
Unicode this, next;
Py_UCS4 this, next;
Py_ssize_t reset;
int cmp;

@@ -1600,7 +1596,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
{
TagData *data = TagData_new(&self->text);
PyObject *token, *text, *trash;
Unicode this, next;
Py_UCS4 this, next;
int can_exit;

if (!data)
@@ -1686,7 +1682,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
Py_ssize_t reset = self->head + 1, pos = 0;
Textbuffer* buf;
PyObject *name, *tag;
Unicode this;
Py_UCS4 this;

self->head += 2;
buf = Textbuffer_new(&self->text);
@@ -1988,7 +1984,7 @@ static PyObject* Tokenizer_parse_style(Tokenizer* self)
static int Tokenizer_handle_list_marker(Tokenizer* self)
{
PyObject *kwargs, *markup;
Unicode code = Tokenizer_read(self, 0);
Py_UCS4 code = Tokenizer_read(self, 0);

if (code == ';')
self->topstack->context |= LC_DLTERM;
@@ -2015,7 +2011,7 @@ static int Tokenizer_handle_list_marker(Tokenizer* self)
*/
static int Tokenizer_handle_list(Tokenizer* self)
{
Unicode marker = Tokenizer_read(self, 1);
Py_UCS4 marker = Tokenizer_read(self, 1);

if (Tokenizer_handle_list_marker(self))
return -1;
@@ -2169,11 +2165,11 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
/*
Handle style attributes for a table until an ending token.
*/
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token)
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Py_UCS4 end_token)
{
TagData *data = TagData_new(&self->text);
PyObject *padding, *trash;
Unicode this;
Py_UCS4 this;
int can_exit;

if (!data)
@@ -2483,7 +2479,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
everything is safe, or -1 if the route must be failed.
*/
static int
Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Unicode data)
Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UCS4 data)
{
if (context & LC_FAIL_NEXT)
return -1;
@@ -2568,7 +2564,7 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Unicode data)
static int Tokenizer_has_leading_whitespace(Tokenizer* self)
{
int offset = 1;
Unicode current_character;
Py_UCS4 current_character;
while (1) {
current_character = Tokenizer_read_backwards(self, offset);
if (!current_character || current_character == '\n')
@@ -2586,7 +2582,7 @@ static int Tokenizer_has_leading_whitespace(Tokenizer* self)
PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
{
uint64_t this_context;
Unicode this, next, next_next, last;
Py_UCS4 this, next, next_next, last;
PyObject* temp;

if (push) {

+ 1
- 1
mwparserfromhell/parser/ctokenizer/tok_parse.h View File

@@ -24,7 +24,7 @@ SOFTWARE.

#include "common.h"

static const Unicode MARKERS[] = {
static const Py_UCS4 MARKERS[] = {
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
'-', '!', '\n', '\0'};


+ 4
- 8
mwparserfromhell/parser/ctokenizer/tok_support.c View File

@@ -275,7 +275,7 @@ int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
/*
Write a Unicode codepoint to the current textbuffer.
*/
int Tokenizer_emit_char(Tokenizer* self, Unicode code)
int Tokenizer_emit_char(Tokenizer* self, Py_UCS4 code)
{
return Textbuffer_write(self->topstack->textbuffer, code);
}
@@ -389,19 +389,15 @@ int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
/*
Internal function to read the codepoint at the given index from the input.
*/
static Unicode read_codepoint(TokenizerInput* text, Py_ssize_t index)
static Py_UCS4 read_codepoint(TokenizerInput* text, Py_ssize_t index)
{
#ifdef PEP_393
return PyUnicode_READ(text->kind, text->data, index);
#else
return text->buf[index];
#endif
}

/*
Read the value at a relative point in the wikicode, forwards.
*/
Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
Py_UCS4 Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
{
Py_ssize_t index = self->head + delta;

@@ -413,7 +409,7 @@ Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
/*
Read the value at a relative point in the wikicode, backwards.
*/
Unicode Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
Py_UCS4 Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
{
Py_ssize_t index;


+ 3
- 3
mwparserfromhell/parser/ctokenizer/tok_support.h View File

@@ -38,14 +38,14 @@ void Tokenizer_free_bad_route_tree(Tokenizer*);

int Tokenizer_emit_token(Tokenizer*, PyObject*, int);
int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int);
int Tokenizer_emit_char(Tokenizer*, Unicode);
int Tokenizer_emit_char(Tokenizer*, Py_UCS4);
int Tokenizer_emit_text(Tokenizer*, const char*);
int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*);
int Tokenizer_emit_all(Tokenizer*, PyObject*);
int Tokenizer_emit_text_then_stack(Tokenizer*, const char*);

Unicode Tokenizer_read(Tokenizer*, Py_ssize_t);
Unicode Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);
Py_UCS4 Tokenizer_read(Tokenizer*, Py_ssize_t);
Py_UCS4 Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);

/* Macros */


+ 9
- 25
mwparserfromhell/parser/ctokenizer/tokenizer.c View File

@@ -85,12 +85,8 @@ static void init_tokenizer_text(TokenizerInput* text)
text->object = Py_None;
Py_INCREF(Py_None);
text->length = 0;
#ifdef PEP_393
text->kind = PyUnicode_WCHAR_KIND;
text->data = NULL;
#else
text->buf = NULL;
#endif
}

/*
@@ -119,14 +115,10 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
dealloc_tokenizer_text(text);
text->object = input;

#ifdef PEP_393
if (PyUnicode_READY(input) < 0)
return -1;
text->kind = PyUnicode_KIND(input);
text->data = PyUnicode_DATA(input);
#else
text->buf = PyUnicode_AS_UNICODE(input);
#endif
text->length = PyUnicode_GET_LENGTH(input);
return 0;
}
@@ -192,11 +184,9 @@ static int load_entities(void)
{
PyObject *tempmod, *defmap, *deflist;
unsigned numdefs, i;
#ifdef IS_PY3K
PyObject *string;
#endif

tempmod = PyImport_ImportModule(ENTITYDEFS_MODULE);
tempmod = PyImport_ImportModule("html.entities");
if (!tempmod)
return -1;
defmap = PyObject_GetAttrString(tempmod, "entitydefs");
@@ -212,14 +202,10 @@ static int load_entities(void)
if (!entitydefs)
return -1;
for (i = 0; i < numdefs; i++) {
#ifdef IS_PY3K
string = PyUnicode_AsASCIIString(PyList_GET_ITEM(deflist, i));
if (!string)
return -1;
entitydefs[i] = PyBytes_AsString(string);
#else
entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i));
#endif
if (!entitydefs[i])
return -1;
}
@@ -233,7 +219,7 @@ static int load_tokens(void)
*globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(),
*fromlist = PyList_New(1),
*modname = IMPORT_NAME_FUNC("tokens");
*modname = PyUnicode_FromString("tokens");
char *name = "mwparserfromhell.parser";

if (!fromlist || !modname)
@@ -256,7 +242,7 @@ static int load_defs(void)
*globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(),
*fromlist = PyList_New(1),
*modname = IMPORT_NAME_FUNC("definitions");
*modname = PyUnicode_FromString("definitions");
char *name = "mwparserfromhell";

if (!fromlist || !modname)
@@ -277,7 +263,7 @@ static int load_exceptions(void)
*globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(),
*fromlist = PyList_New(1),
*modname = IMPORT_NAME_FUNC("parser");
*modname = PyUnicode_FromString("parser");
char *name = "mwparserfromhell";

if (!fromlist || !modname)
@@ -294,24 +280,22 @@ static int load_exceptions(void)
return 0;
}

PyMODINIT_FUNC INIT_FUNC_NAME(void)
PyMODINIT_FUNC PyInit__tokenizer(void)
{
PyObject *module;

TokenizerType.tp_new = PyType_GenericNew;
if (PyType_Ready(&TokenizerType) < 0)
INIT_ERROR;
module = CREATE_MODULE;
return NULL;
module = PyModule_Create(&module_def);
if (!module)
INIT_ERROR;
return NULL;
Py_INCREF(&TokenizerType);
PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
Py_INCREF(Py_True);
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
NOARGS = PyTuple_New(0);
if (!NOARGS || load_entities() || load_tokens() || load_defs())
INIT_ERROR;
#ifdef IS_PY3K
return NULL;
return module;
#endif
}

+ 0
- 18
mwparserfromhell/parser/ctokenizer/tokenizer.h View File

@@ -32,22 +32,6 @@ static void Tokenizer_dealloc(Tokenizer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

/* Compatibility macros */

#ifdef IS_PY3K
#define IMPORT_NAME_FUNC PyUnicode_FromString
#define CREATE_MODULE PyModule_Create(&module_def);
#define ENTITYDEFS_MODULE "html.entities"
#define INIT_FUNC_NAME PyInit__tokenizer
#define INIT_ERROR return NULL
#else
#define IMPORT_NAME_FUNC PyBytes_FromString
#define CREATE_MODULE Py_InitModule("_tokenizer", NULL);
#define ENTITYDEFS_MODULE "htmlentitydefs"
#define INIT_FUNC_NAME init_tokenizer
#define INIT_ERROR return
#endif

/* Structs */

static PyMethodDef Tokenizer_methods[] = {
@@ -101,11 +85,9 @@ static PyTypeObject TokenizerType = {
Tokenizer_new, /* tp_new */
};

#ifdef IS_PY3K
static PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"_tokenizer",
"Creates a list of tokens from a string of wikicode.",
-1, NULL, NULL, NULL, NULL, NULL
};
#endif

+ 4
- 6
mwparserfromhell/parser/tokenizer.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,12 +19,11 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import html.entities as htmlentities
from math import log
import re

from . import contexts, tokens, ParserError
from ..compat import htmlentities, range
from ..definitions import (get_html_tag, is_parsable, is_single,
is_single_only, is_scheme)

@@ -35,11 +33,11 @@ class BadRoute(Exception):
"""Raised internally when the current tokenization route is invalid."""

def __init__(self, context=0):
super(BadRoute, self).__init__()
super().__init__()
self.context = context


class _TagOpenData(object):
class _TagOpenData:
"""Stores data about an HTML open tag, like ``<ref name="foo">``."""
CX_NAME = 1 << 0
CX_ATTR_READY = 1 << 1
@@ -57,7 +55,7 @@ class _TagOpenData(object):
self.reset = 0


class Tokenizer(object):
class Tokenizer:
"""Creates a list of tokens from a string of wikicode."""
USES_C = False
START = object()

+ 1
- 5
mwparserfromhell/parser/tokens.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -28,9 +27,6 @@ a syntactically valid form by the :class:`.Tokenizer`, and then converted into
the :class`.Wikicode` tree by the :class:`.Builder`.
"""

from __future__ import unicode_literals

from ..compat import py3k, str

__all__ = ["Token"]

@@ -65,7 +61,7 @@ class Token(dict):
def make(name):
"""Create a new Token class using ``type()`` and add it to ``__all__``."""
__all__.append(name)
return type(name if py3k else name.encode("utf8"), (Token,), {})
return type(name, (Token,), {})

Text = make("Text")


+ 13
- 43
mwparserfromhell/smart_list/ListProxy.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com>
@@ -24,7 +23,6 @@
# SmartList has to be a full import in order to avoid cyclical import errors
import mwparserfromhell.smart_list.SmartList
from .utils import _SliceNormalizerMixIn, inheritdoc
from ..compat import py3k


class _ListProxy(_SliceNormalizerMixIn, list):
@@ -36,7 +34,7 @@ class _ListProxy(_SliceNormalizerMixIn, list):
"""

def __init__(self, parent, sliceinfo):
super(_ListProxy, self).__init__()
super().__init__()
self._parent = parent
self._sliceinfo = sliceinfo

@@ -73,12 +71,8 @@ class _ListProxy(_SliceNormalizerMixIn, list):
return self._render() >= list(other)
return self._render() >= other

if py3k:
def __bool__(self):
return bool(self._render())
else:
def __nonzero__(self):
return bool(self._render())
def __bool__(self):
return bool(self._render())

def __len__(self):
return max((self._stop - self._start) // self._step, 0)
@@ -138,16 +132,6 @@ class _ListProxy(_SliceNormalizerMixIn, list):
def __contains__(self, item):
return item in self._render()

if not py3k:
def __getslice__(self, start, stop):
return self.__getitem__(slice(start, stop))

def __setslice__(self, start, stop, iterable):
self.__setitem__(slice(start, stop), iterable)

def __delslice__(self, start, stop):
self.__delitem__(slice(start, stop))

def __add__(self, other):
return mwparserfromhell.smart_list.SmartList(list(self) + other)

@@ -237,27 +221,13 @@ class _ListProxy(_SliceNormalizerMixIn, list):
item.reverse()
self._parent[self._start:self._stop:self._step] = item

if py3k:
@inheritdoc
def sort(self, key=None, reverse=None):
item = self._render()
kwargs = {}
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
item.sort(**kwargs)
self._parent[self._start:self._stop:self._step] = item
else:
@inheritdoc
def sort(self, cmp=None, key=None, reverse=None):
item = self._render()
kwargs = {}
if cmp is not None:
kwargs["cmp"] = cmp
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
item.sort(**kwargs)
self._parent[self._start:self._stop:self._step] = item
@inheritdoc
def sort(self, key=None, reverse=None):
item = self._render()
kwargs = {}
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
item.sort(**kwargs)
self._parent[self._start:self._stop:self._step] = item

+ 19
- 47
mwparserfromhell/smart_list/SmartList.py View File

@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com>
#
@@ -25,7 +23,6 @@ from _weakref import ref

from .ListProxy import _ListProxy
from .utils import _SliceNormalizerMixIn, inheritdoc
from ..compat import py3k


class SmartList(_SliceNormalizerMixIn, list):
@@ -54,14 +51,14 @@ class SmartList(_SliceNormalizerMixIn, list):

def __init__(self, iterable=None):
if iterable:
super(SmartList, self).__init__(iterable)
super().__init__(iterable)
else:
super(SmartList, self).__init__()
super().__init__()
self._children = {}

def __getitem__(self, key):
if not isinstance(key, slice):
return super(SmartList, self).__getitem__(key)
return super().__getitem__(key)
key = self._normalize_slice(key, clamp=False)
sliceinfo = [key.start, key.stop, key.step]
child = _ListProxy(self, sliceinfo)
@@ -71,44 +68,32 @@ class SmartList(_SliceNormalizerMixIn, list):

def __setitem__(self, key, item):
if not isinstance(key, slice):
return super(SmartList, self).__setitem__(key, item)
return super().__setitem__(key, item)
item = list(item)
super(SmartList, self).__setitem__(key, item)
super().__setitem__(key, item)
key = self._normalize_slice(key, clamp=True)
diff = len(item) + (key.start - key.stop) // key.step
if not diff:
return
values = self._children.values if py3k else self._children.itervalues
for child, (start, stop, step) in values():
for child, (start, stop, step) in self._children.values():
if start > key.stop:
self._children[id(child)][1][0] += diff
if stop is not None and stop >= key.stop:
self._children[id(child)][1][1] += diff

def __delitem__(self, key):
super(SmartList, self).__delitem__(key)
super().__delitem__(key)
if isinstance(key, slice):
key = self._normalize_slice(key, clamp=True)
else:
key = slice(key, key + 1, 1)
diff = (key.stop - key.start) // key.step
values = self._children.values if py3k else self._children.itervalues
for child, (start, stop, step) in values():
for child, (start, stop, step) in self._children.values():
if start > key.start:
self._children[id(child)][1][0] -= diff
if stop is not None and stop >= key.stop:
self._children[id(child)][1][1] -= diff

if not py3k:
def __getslice__(self, start, stop):
return self.__getitem__(slice(start, stop))

def __setslice__(self, start, stop, iterable):
self.__setitem__(slice(start, stop), iterable)

def __delslice__(self, start, stop):
self.__delitem__(slice(start, stop))

def __add__(self, other):
return SmartList(list(self) + other)

@@ -159,27 +144,14 @@ class SmartList(_SliceNormalizerMixIn, list):
@inheritdoc
def reverse(self):
self._detach_children()
super(SmartList, self).reverse()

if py3k:
@inheritdoc
def sort(self, key=None, reverse=None):
self._detach_children()
kwargs = {}
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
super(SmartList, self).sort(**kwargs)
else:
@inheritdoc
def sort(self, cmp=None, key=None, reverse=None):
self._detach_children()
kwargs = {}
if cmp is not None:
kwargs["cmp"] = cmp
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
super(SmartList, self).sort(**kwargs)
super().reverse()

@inheritdoc
def sort(self, key=None, reverse=None):
self._detach_children()
kwargs = {}
if key is not None:
kwargs["key"] = key
if reverse is not None:
kwargs["reverse"] = reverse
super().sort(**kwargs)

+ 0
- 1
mwparserfromhell/smart_list/__init__.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com>

+ 1
- 5
mwparserfromhell/smart_list/utils.py View File

@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com>
#
@@ -21,8 +19,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

from sys import maxsize

__all__ = []
@@ -38,7 +34,7 @@ def inheritdoc(method):
return method


class _SliceNormalizerMixIn(object):
class _SliceNormalizerMixIn:
"""MixIn that provides a private method to normalize slices."""

def _normalize_slice(self, key, clamp=False):

+ 11
- 25
mwparserfromhell/string_mixin.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -22,14 +21,11 @@

"""
This module contains the :class:`.StringMixIn` type, which implements the
interface for the ``unicode`` type (``str`` on py3k) in a dynamic manner.
interface for the ``str`` type in a dynamic manner.
"""

from __future__ import unicode_literals
from sys import getdefaultencoding

from .compat import bytes, py3k, str

__all__ = ["StringMixIn"]

def inheritdoc(method):
@@ -41,24 +37,20 @@ def inheritdoc(method):
method.__doc__ = getattr(str, method.__name__).__doc__
return method

class StringMixIn(object):
class StringMixIn:
"""Implement the interface for ``unicode``/``str`` in a dynamic manner.

To use this class, inherit from it and override the :meth:`__unicode__`
method (same on py3k) to return the string representation of the object.
method to return the string representation of the object.
The various string methods will operate on the value of :meth:`__unicode__`
instead of the immutable ``self`` like the regular ``str`` type.
"""

if py3k:
def __str__(self):
return self.__unicode__()
def __str__(self):
return self.__unicode__()

def __bytes__(self):
return bytes(self.__unicode__(), getdefaultencoding())
else:
def __str__(self):
return bytes(self.__unicode__())
def __bytes__(self):
return bytes(self.__unicode__(), getdefaultencoding())

def __unicode__(self):
raise NotImplementedError()
@@ -84,19 +76,14 @@ class StringMixIn(object):
def __ge__(self, other):
return self.__unicode__() >= other

if py3k:
def __bool__(self):
return bool(self.__unicode__())
else:
def __nonzero__(self):
return bool(self.__unicode__())
def __bool__(self):
return bool(self.__unicode__())

def __len__(self):
return len(self.__unicode__())

def __iter__(self):
for char in self.__unicode__():
yield char
yield from self.__unicode__()

def __getitem__(self, key):
return self.__unicode__()[key]
@@ -113,8 +100,7 @@ class StringMixIn(object):
type(self).__name__, attr))
return getattr(self.__unicode__(), attr)

if py3k:
maketrans = str.maketrans # Static method can't rely on __getattr__
maketrans = str.maketrans # Static method can't rely on __getattr__


del inheritdoc

+ 0
- 3
mwparserfromhell/utils.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -25,9 +24,7 @@ This module contains accessory functions for other parts of the library. Parser
users generally won't need stuff from here.
"""

from __future__ import unicode_literals

from .compat import bytes, str
from .nodes import Node
from .smart_list import SmartList


+ 3
- 8
mwparserfromhell/wikicode.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,12 +19,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

import re
from itertools import chain

from .compat import bytes, py3k, range, str
from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity,
Node, Tag, Template, Text, Wikilink)
from .smart_list.ListProxy import _ListProxy
@@ -49,7 +45,7 @@ class Wikicode(StringMixIn):
RECURSE_OTHERS = 2

def __init__(self, nodes):
super(Wikicode, self).__init__()
super().__init__()
self._nodes = nodes

def __unicode__(self):
@@ -64,8 +60,7 @@ class Wikicode(StringMixIn):
for code in node.__children__():
for child in code.nodes:
sub = Wikicode._get_children(child, contexts, restrict, code)
for result in sub:
yield result
yield from sub

@staticmethod
def _slice_replace(code, index, old, new):
@@ -253,7 +248,7 @@ class Wikicode(StringMixIn):
self.ifilter(forcetype=ftype, *a, **kw))
make_filter = lambda ftype: (lambda self, *a, **kw:
self.filter(forcetype=ftype, *a, **kw))
for name, ftype in (meths.items() if py3k else meths.iteritems()):
for name, ftype in meths.items():
ifilter = make_ifilter(ftype)
filter = make_filter(ftype)
ifilter.__doc__ = doc.format(name, "ifilter", ftype)

+ 0
- 3
scripts/memtest.py View File

@@ -40,7 +40,6 @@ import sys

import psutil

from mwparserfromhell.compat import py3k
from mwparserfromhell.parser._tokenizer import CTokenizer

if sys.version_info[0] == 2:
@@ -88,8 +87,6 @@ class MemoryTest(object):
def load_file(filename):
with open(filename, "rU") as fp:
text = fp.read()
if not py3k:
text = text.decode("utf8")
name = path.split(filename)[1][:0-len(extension)]
self._parse_file(name, text)


+ 3
- 9
setup.py View File

@@ -1,5 +1,4 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -21,23 +20,20 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import print_function
from distutils.errors import DistutilsError, CCompilerError
from glob import glob
from os import environ
import sys

if ((sys.version_info[0] == 2 and sys.version_info[1] < 7) or
(sys.version_info[1] == 3 and sys.version_info[1] < 4)):
raise RuntimeError("mwparserfromhell needs Python 2.7 or 3.4+")
if sys.version_info[1] == 3 and sys.version_info[1] < 4:
raise RuntimeError("mwparserfromhell needs 3.4+")

from setuptools import setup, find_packages, Extension
from setuptools.command.build_ext import build_ext

from mwparserfromhell import __version__
from mwparserfromhell.compat import py3k

with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp:
with open("README.rst", encoding='utf-8') as fp:
long_docs = fp.read()

use_extension = True
@@ -98,8 +94,6 @@ setup(
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 2",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",

+ 0
- 1
tests/__init__.py View File

@@ -1 +0,0 @@
# -*- coding: utf-8 -*-

+ 1
- 6
tests/_test_tokenizer.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,13 +19,11 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import codecs
from os import listdir, path
import sys
import warnings

from mwparserfromhell.compat import py3k, str
from mwparserfromhell.parser import tokens
from mwparserfromhell.parser.builder import Builder

@@ -35,7 +32,7 @@ class _TestParseError(Exception):
pass


class TokenizerTestCase(object):
class TokenizerTestCase:
"""A base test case for tokenizers, whose tests are loaded dynamically.

Subclassed along with unittest.TestCase to form TestPyTokenizer and
@@ -60,8 +57,6 @@ class TokenizerTestCase(object):
actual = self.tokenizer().tokenize(data["input"])
self.assertEqual(expected, actual)

if not py3k:
inner.__name__ = funcname.encode("utf8")
inner.__doc__ = data["label"]
return inner


+ 0
- 3
tests/_test_tree_equality.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
from unittest import TestCase

from mwparserfromhell.compat import range
from mwparserfromhell.nodes import (Argument, Comment, Heading, HTMLEntity,
Tag, Template, Text, Wikilink)
from mwparserfromhell.nodes.extras import Attribute, Parameter

+ 0
- 18
tests/compat.py View File

@@ -1,18 +0,0 @@
# -*- coding: utf-8 -*-

"""
Serves the same purpose as mwparserfromhell.compat, but only for objects
required by unit tests. This avoids unnecessary imports (like urllib) within
the main library.
"""

from mwparserfromhell.compat import py3k

if py3k:
from io import StringIO
from urllib.parse import urlencode
from urllib.request import urlopen

else:
from StringIO import StringIO
from urllib import urlencode, urlopen

+ 0
- 3
tests/test_argument.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import unittest

from mwparserfromhell.compat import str
from mwparserfromhell.nodes import Argument, Text

from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext

+ 0
- 3
tests/test_attribute.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import unittest

from mwparserfromhell.compat import str
from mwparserfromhell.nodes import Template
from mwparserfromhell.nodes.extras import Attribute


+ 1
- 5
tests/test_builder.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import unittest

from mwparserfromhell.compat import py3k
from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading,
HTMLEntity, Tag, Template, Text, Wikilink)
from mwparserfromhell.nodes.extras import Attribute, Parameter
@@ -428,9 +425,8 @@ class TestBuilder(TreeEqualityTestCase):
[tokens.TagOpenOpen()]
]

func = self.assertRaisesRegex if py3k else self.assertRaisesRegexp
msg = r"_handle_token\(\) got unexpected TemplateClose"
func(ParserError, msg, self.builder.build, [tokens.TemplateClose()])
self.assertRaisesRegex(ParserError, msg, self.builder.build, [tokens.TemplateClose()])
for test in missing_closes:
self.assertRaises(ParserError, self.builder.build, test)


+ 0
- 3
tests/test_comment.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,10 +19,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import unittest

from mwparserfromhell.compat import str
from mwparserfromhell.nodes import Comment

from ._test_tree_equality import TreeEqualityTestCase

+ 0
- 2
tests/test_ctokenizer.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,7 +19,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
import unittest

try:

+ 11
- 29
tests/test_docs.py View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
@@ -20,15 +19,14 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import print_function, unicode_literals
import json
from io import StringIO
import os
import unittest
from urllib.parse import urlencode
from urllib.request import urlopen

import mwparserfromhell
from mwparserfromhell.compat import py3k, str

from .compat import StringIO, urlencode, urlopen

class TestDocs(unittest.TestCase):
"""Integration test cases for mwparserfromhell's documentation."""
@@ -47,16 +45,10 @@ class TestDocs(unittest.TestCase):