@@ -1,7 +1,6 @@ | |||
dist: xenial | |||
language: python | |||
python: | |||
- 2.7 | |||
- 3.4 | |||
- 3.5 | |||
- 3.6 | |||
@@ -11,7 +11,7 @@ mwparserfromhell | |||
**mwparserfromhell** (the *MediaWiki Parser from Hell*) is a Python package | |||
that provides an easy-to-use and outrageously powerful parser for MediaWiki_ | |||
wikicode. It supports Python 2 and Python 3. | |||
wikicode. It supports Python 3.4+. | |||
Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others. | |||
Full documentation is available on ReadTheDocs_. Development occurs on GitHub_. | |||
@@ -41,7 +41,7 @@ Normal usage is rather straightforward (where ``text`` is page text): | |||
>>> wikicode = mwparserfromhell.parse(text) | |||
``wikicode`` is a ``mwparserfromhell.Wikicode`` object, which acts like an | |||
ordinary ``str`` object (or ``unicode`` in Python 2) with some extra methods. | |||
ordinary ``str`` object with some extra methods. | |||
For example: | |||
>>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?" | |||
@@ -111,8 +111,6 @@ saving the page!) by calling ``str()`` on it: | |||
>>> text == code | |||
True | |||
Likewise, use ``unicode(code)`` in Python 2. | |||
Limitations | |||
----------- | |||
@@ -22,14 +22,6 @@ environment: | |||
secure: gOIcvPxSC2ujuhwOzwj3v8xjq3CCYd8keFWVnguLM+gcL0e02qshDHy7gwZZwj0+ | |||
matrix: | |||
- PYTHON: "C:\\Python27" | |||
PYTHON_VERSION: "2.7" | |||
PYTHON_ARCH: "32" | |||
- PYTHON: "C:\\Python27-x64" | |||
PYTHON_VERSION: "2.7" | |||
PYTHON_ARCH: "64" | |||
- PYTHON: "C:\\Python34" | |||
PYTHON_VERSION: "3.4" | |||
PYTHON_ARCH: "32" | |||
@@ -3,7 +3,7 @@ MWParserFromHell v\ |version| Documentation | |||
:mod:`mwparserfromhell` (the *MediaWiki Parser from Hell*) is a Python package | |||
that provides an easy-to-use and outrageously powerful parser for MediaWiki_ | |||
wikicode. It supports Python 2 and Python 3. | |||
wikicode. It supports Python 3.4+. | |||
Developed by Earwig_ with contributions from `Σ`_, Legoktm_, and others. | |||
Development occurs on GitHub_. | |||
@@ -7,8 +7,7 @@ Normal usage is rather straightforward (where ``text`` is page text):: | |||
>>> wikicode = mwparserfromhell.parse(text) | |||
``wikicode`` is a :class:`mwparserfromhell.Wikicode <.Wikicode>` object, which | |||
acts like an ordinary ``str`` object (or ``unicode`` in Python 2) with some | |||
extra methods. For example:: | |||
acts like an ordinary ``str`` object with some extra methods. For example:: | |||
>>> text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?" | |||
>>> wikicode = mwparserfromhell.parse(text) | |||
@@ -78,7 +77,6 @@ saving the page!) by calling :func:`str` on it:: | |||
>>> text == code | |||
True | |||
(Likewise, use :func:`unicode(code) <unicode>` in Python 2.) | |||
For more tips, check out :class:`Wikicode's full method list <.Wikicode>` and | |||
the :mod:`list of Nodes <.nodes>`. |
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -32,7 +31,7 @@ __license__ = "MIT License" | |||
__version__ = "0.6.dev0" | |||
__email__ = "ben.kurtovic@gmail.com" | |||
from . import (compat, definitions, nodes, parser, smart_list, string_mixin, | |||
from . import (definitions, nodes, parser, smart_list, string_mixin, | |||
utils, wikicode) | |||
parse = utils.parse_anything |
@@ -1,27 +0,0 @@ | |||
# -*- coding: utf-8 -*-

"""
Implements support for both Python 2 and Python 3 by defining common types in
terms of their Python 2/3 variants. For example, :class:`str` is set to
:class:`unicode` on Python 2 but :class:`str` on Python 3; likewise,
:class:`bytes` is :class:`str` on 2 but :class:`bytes` on 3. These types are
meant to be imported directly from within the parser's modules.
"""

import sys

# True when the running interpreter's major version is 3; it selects which
# set of aliases is bound below.
py3k = (sys.version_info[0] == 3)

if py3k:
    # Python 3: re-bind the builtins under their own names so that they exist
    # as module attributes and can be imported from here.
    bytes = bytes
    str = str
    range = range
    import html.entities as htmlentities
else:
    # Python 2: map the Python 3 names onto their Python 2 equivalents.
    # ``bytes`` must be bound before ``str`` is rebound, because it needs the
    # original builtin ``str``.
    bytes = str
    str = unicode
    range = xrange
    import htmlentitydefs as htmlentities

# Remove ``sys`` from the module namespace once the version check is done.
del sys
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -28,7 +27,6 @@ When updating this file, please also update the C tokenizer version: | |||
- mwparserfromhell/parser/ctokenizer/definitions.h | |||
""" | |||
from __future__ import unicode_literals | |||
__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", | |||
"is_single_only", "is_scheme"] | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -29,9 +28,7 @@ the name of a :class:`.Template` is a :class:`.Wikicode` object that can | |||
contain text or more templates. | |||
""" | |||
from __future__ import unicode_literals | |||
from ..compat import str | |||
from ..string_mixin import StringMixIn | |||
__all__ = ["Argument", "Comment", "ExternalLink", "HTMLEntity", "Heading", | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,10 +19,8 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from . import Node | |||
from ..compat import str | |||
from ..utils import parse_anything | |||
__all__ = ["Argument"] | |||
@@ -32,7 +29,7 @@ class Argument(Node): | |||
"""Represents a template argument substitution, like ``{{{foo}}}``.""" | |||
def __init__(self, name, default=None): | |||
super(Argument, self).__init__() | |||
super().__init__() | |||
self.name = name | |||
self.default = default | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,10 +19,8 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from . import Node | |||
from ..compat import str | |||
__all__ = ["Comment"] | |||
@@ -31,7 +28,7 @@ class Comment(Node): | |||
"""Represents a hidden HTML comment, like ``<!-- foobar -->``.""" | |||
def __init__(self, contents): | |||
super(Comment, self).__init__() | |||
super().__init__() | |||
self.contents = contents | |||
def __unicode__(self): | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,10 +19,8 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from . import Node | |||
from ..compat import str | |||
from ..utils import parse_anything | |||
__all__ = ["ExternalLink"] | |||
@@ -32,7 +29,7 @@ class ExternalLink(Node): | |||
"""Represents an external link, like ``[http://example.com/ Example]``.""" | |||
def __init__(self, url, title=None, brackets=True): | |||
super(ExternalLink, self).__init__() | |||
super().__init__() | |||
self.url = url | |||
self.title = title | |||
self.brackets = brackets | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,9 +19,7 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from ...compat import str | |||
from ...string_mixin import StringMixIn | |||
from ...utils import parse_anything | |||
@@ -38,7 +35,7 @@ class Attribute(StringMixIn): | |||
def __init__(self, name, value=None, quotes='"', pad_first=" ", | |||
pad_before_eq="", pad_after_eq=""): | |||
super(Attribute, self).__init__() | |||
super().__init__() | |||
self.name = name | |||
self._quotes = None | |||
self.value = value | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,10 +19,8 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
import re | |||
from ...compat import str | |||
from ...string_mixin import StringMixIn | |||
from ...utils import parse_anything | |||
@@ -39,7 +36,7 @@ class Parameter(StringMixIn): | |||
""" | |||
def __init__(self, name, value, showkey=True): | |||
super(Parameter, self).__init__() | |||
super().__init__() | |||
self.name = name | |||
self.value = value | |||
self.showkey = showkey | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,10 +19,8 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from . import Node | |||
from ..compat import str | |||
from ..utils import parse_anything | |||
__all__ = ["Heading"] | |||
@@ -32,7 +29,7 @@ class Heading(Node): | |||
"""Represents a section heading in wikicode, like ``== Foo ==``.""" | |||
def __init__(self, title, level): | |||
super(Heading, self).__init__() | |||
super().__init__() | |||
self.title = title | |||
self.level = level | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,10 +19,9 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
import html.entities as htmlentities | |||
from . import Node | |||
from ..compat import htmlentities, py3k, str | |||
__all__ = ["HTMLEntity"] | |||
@@ -31,7 +29,7 @@ class HTMLEntity(Node): | |||
"""Represents an HTML entity, like ``&nbsp;``, either named or unnamed.""" | |||
def __init__(self, value, named=None, hexadecimal=False, hex_char="x"): | |||
super(HTMLEntity, self).__init__() | |||
super().__init__() | |||
self._value = value | |||
if named is None: # Try to guess whether or not the entity is named | |||
try: | |||
@@ -63,32 +61,6 @@ class HTMLEntity(Node): | |||
return self.normalize() | |||
return self | |||
if not py3k: | |||
@staticmethod | |||
def _unichr(value): | |||
"""Implement builtin unichr() with support for non-BMP code points. | |||
On wide Python builds, this functions like the normal unichr(). On | |||
narrow builds, this returns the value's encoded surrogate pair. | |||
""" | |||
try: | |||
return unichr(value) | |||
except ValueError: | |||
# Test whether we're on the wide or narrow Python build. Check | |||
# the length of a non-BMP code point | |||
# (U+1F64A, SPEAK-NO-EVIL MONKEY): | |||
if len("\U0001F64A") == 1: # pragma: no cover | |||
raise | |||
# Ensure this is within the range we can encode: | |||
if value > 0x10FFFF: | |||
raise ValueError("unichr() arg not in range(0x110000)") | |||
code = value - 0x10000 | |||
if value < 0: # Invalid code point | |||
raise | |||
lead = 0xD800 + (code >> 10) | |||
trail = 0xDC00 + (code % (1 << 10)) | |||
return unichr(lead) + unichr(trail) | |||
@property | |||
def value(self): | |||
"""The string value of the HTML entity.""" | |||
@@ -173,9 +145,8 @@ class HTMLEntity(Node): | |||
def normalize(self): | |||
"""Return the unicode character represented by the HTML entity.""" | |||
chrfunc = chr if py3k else HTMLEntity._unichr | |||
if self.named: | |||
return chrfunc(htmlentities.name2codepoint[self.value]) | |||
return chr(htmlentities.name2codepoint[self.value]) | |||
if self.hexadecimal: | |||
return chrfunc(int(self.value, 16)) | |||
return chrfunc(int(self.value)) | |||
return chr(int(self.value, 16)) | |||
return chr(int(self.value)) |
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,11 +19,9 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from . import Node | |||
from .extras import Attribute | |||
from ..compat import str | |||
from ..definitions import is_visible | |||
from ..utils import parse_anything | |||
@@ -37,7 +34,7 @@ class Tag(Node): | |||
self_closing=False, invalid=False, implicit=False, padding="", | |||
closing_tag=None, wiki_style_separator=None, | |||
closing_wiki_markup=None): | |||
super(Tag, self).__init__() | |||
super().__init__() | |||
self.tag = tag | |||
self.contents = contents | |||
self._attrs = attrs if attrs else [] | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,13 +19,11 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from collections import defaultdict | |||
import re | |||
from . import HTMLEntity, Node, Text | |||
from .extras import Parameter | |||
from ..compat import range, str | |||
from ..utils import parse_anything | |||
__all__ = ["Template"] | |||
@@ -37,7 +34,7 @@ class Template(Node): | |||
"""Represents a template in wikicode, like ``{{foo}}``.""" | |||
def __init__(self, name, params=None): | |||
super(Template, self).__init__() | |||
super().__init__() | |||
self.name = name | |||
if params: | |||
self._params = params | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,10 +19,8 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from . import Node | |||
from ..compat import str | |||
__all__ = ["Text"] | |||
@@ -31,7 +28,7 @@ class Text(Node): | |||
"""Represents ordinary, unformatted text with no special properties.""" | |||
def __init__(self, value): | |||
super(Text, self).__init__() | |||
super().__init__() | |||
self.value = value | |||
def __unicode__(self): | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,10 +19,8 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from . import Node | |||
from ..compat import str | |||
from ..utils import parse_anything | |||
__all__ = ["Wikilink"] | |||
@@ -32,7 +29,7 @@ class Wikilink(Node): | |||
"""Represents an internal wikilink, like ``[[Foo|Bar]]``.""" | |||
def __init__(self, title, text=None): | |||
super(Wikilink, self).__init__() | |||
super().__init__() | |||
self.title = title | |||
self.text = text | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -36,7 +35,7 @@ class ParserError(Exception): | |||
""" | |||
def __init__(self, extra): | |||
msg = "This is a bug and should be reported. Info: {}.".format(extra) | |||
super(ParserError, self).__init__(msg) | |||
super().__init__(msg) | |||
from .builder import Builder | |||
@@ -50,7 +49,7 @@ except ImportError: | |||
__all__ = ["use_c", "Parser", "ParserError"] | |||
class Parser(object): | |||
class Parser: | |||
"""Represents a parser for wikicode. | |||
Actual parsing is a two-step process: first, the text is split up into a | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,10 +19,8 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from . import tokens, ParserError | |||
from ..compat import str | |||
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, | |||
Template, Text, Wikilink) | |||
from ..nodes.extras import Attribute, Parameter | |||
@@ -45,7 +42,7 @@ def _add_handler(token_type): | |||
return decorator | |||
class Builder(object): | |||
class Builder: | |||
"""Builds a tree of nodes out of a sequence of tokens. | |||
To use, pass a list of :class:`.Token`\\ s to the :meth:`build` method. The | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -23,7 +23,7 @@ SOFTWARE. | |||
#pragma once | |||
#ifndef PY_SSIZE_T_CLEAN | |||
#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html | |||
#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html | |||
#endif | |||
#include <Python.h> | |||
@@ -34,10 +34,6 @@ SOFTWARE. | |||
/* Compatibility macros */ | |||
#if PY_MAJOR_VERSION >= 3 | |||
#define IS_PY3K | |||
#endif | |||
#ifndef uint64_t | |||
#define uint64_t unsigned PY_LONG_LONG | |||
#endif | |||
@@ -48,20 +44,8 @@ SOFTWARE. | |||
/* Unicode support macros */ | |||
#if defined(IS_PY3K) && PY_MINOR_VERSION >= 3 | |||
#define PEP_393 | |||
#endif | |||
#ifdef PEP_393 | |||
#define Unicode Py_UCS4 | |||
#define PyUnicode_FROM_SINGLE(chr) \ | |||
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1) | |||
#else | |||
#define Unicode Py_UNICODE | |||
#define PyUnicode_FROM_SINGLE(chr) \ | |||
PyUnicode_FromUnicode(&(chr), 1) | |||
#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE | |||
#endif | |||
/* Error handling macros */ | |||
@@ -85,13 +69,9 @@ extern PyObject* definitions; | |||
typedef struct { | |||
Py_ssize_t capacity; | |||
Py_ssize_t length; | |||
#ifdef PEP_393 | |||
PyObject* object; | |||
int kind; | |||
void* data; | |||
#else | |||
Py_UNICODE* data; | |||
#endif | |||
} Textbuffer; | |||
typedef struct { | |||
@@ -111,12 +91,8 @@ typedef struct Stack Stack; | |||
typedef struct { | |||
PyObject* object; /* base PyUnicodeObject object */ | |||
Py_ssize_t length; /* length of object, in code points */ | |||
#ifdef PEP_393 | |||
int kind; /* object's kind value */ | |||
void* data; /* object's raw unicode buffer */ | |||
#else | |||
Py_UNICODE* buf; /* object's internal buffer */ | |||
#endif | |||
} TokenizerInput; | |||
typedef struct avl_tree_node avl_tree; | |||
@@ -32,7 +32,7 @@ typedef struct { | |||
Textbuffer* pad_first; | |||
Textbuffer* pad_before_eq; | |||
Textbuffer* pad_after_eq; | |||
Unicode quoter; | |||
Py_UCS4 quoter; | |||
Py_ssize_t reset; | |||
} TagData; | |||
@@ -29,23 +29,16 @@ SOFTWARE. | |||
/* | |||
Internal allocation function for textbuffers. | |||
*/ | |||
static int internal_alloc(Textbuffer* self, Unicode maxchar) | |||
static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar) | |||
{ | |||
self->capacity = INITIAL_CAPACITY; | |||
self->length = 0; | |||
#ifdef PEP_393 | |||
self->object = PyUnicode_New(self->capacity, maxchar); | |||
if (!self->object) | |||
return -1; | |||
self->kind = PyUnicode_KIND(self->object); | |||
self->data = PyUnicode_DATA(self->object); | |||
#else | |||
(void) maxchar; // Unused | |||
self->data = malloc(sizeof(Unicode) * self->capacity); | |||
if (!self->data) | |||
return -1; | |||
#endif | |||
return 0; | |||
} | |||
@@ -55,11 +48,7 @@ static int internal_alloc(Textbuffer* self, Unicode maxchar) | |||
*/ | |||
static void internal_dealloc(Textbuffer* self) | |||
{ | |||
#ifdef PEP_393 | |||
Py_DECREF(self->object); | |||
#else | |||
free(self->data); | |||
#endif | |||
} | |||
/* | |||
@@ -67,7 +56,6 @@ static void internal_dealloc(Textbuffer* self) | |||
*/ | |||
static int internal_resize(Textbuffer* self, Py_ssize_t new_cap) | |||
{ | |||
#ifdef PEP_393 | |||
PyObject *newobj; | |||
void *newdata; | |||
@@ -79,10 +67,6 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap) | |||
Py_DECREF(self->object); | |||
self->object = newobj; | |||
self->data = newdata; | |||
#else | |||
if (!(self->data = realloc(self->data, sizeof(Unicode) * new_cap))) | |||
return -1; | |||
#endif | |||
self->capacity = new_cap; | |||
return 0; | |||
@@ -94,11 +78,9 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap) | |||
Textbuffer* Textbuffer_new(TokenizerInput* text) | |||
{ | |||
Textbuffer* self = malloc(sizeof(Textbuffer)); | |||
Unicode maxchar = 0; | |||
Py_UCS4 maxchar = 0; | |||
#ifdef PEP_393 | |||
maxchar = PyUnicode_MAX_CHAR_VALUE(text->object); | |||
#endif | |||
if (!self) | |||
goto fail_nomem; | |||
@@ -127,11 +109,9 @@ void Textbuffer_dealloc(Textbuffer* self) | |||
*/ | |||
int Textbuffer_reset(Textbuffer* self) | |||
{ | |||
Unicode maxchar = 0; | |||
Py_UCS4 maxchar = 0; | |||
#ifdef PEP_393 | |||
maxchar = PyUnicode_MAX_CHAR_VALUE(self->object); | |||
#endif | |||
internal_dealloc(self); | |||
if (internal_alloc(self, maxchar)) | |||
@@ -142,18 +122,14 @@ int Textbuffer_reset(Textbuffer* self) | |||
/* | |||
Write a Unicode codepoint to the given textbuffer. | |||
*/ | |||
int Textbuffer_write(Textbuffer* self, Unicode code) | |||
int Textbuffer_write(Textbuffer* self, Py_UCS4 code) | |||
{ | |||
if (self->length >= self->capacity) { | |||
if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0) | |||
return -1; | |||
} | |||
#ifdef PEP_393 | |||
PyUnicode_WRITE(self->kind, self->data, self->length++, code); | |||
#else | |||
self->data[self->length++] = code; | |||
#endif | |||
return 0; | |||
} | |||
@@ -163,13 +139,9 @@ int Textbuffer_write(Textbuffer* self, Unicode code) | |||
This function does not check for bounds. | |||
*/ | |||
Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index) | |||
Py_UCS4 Textbuffer_read(Textbuffer* self, Py_ssize_t index) | |||
{ | |||
#ifdef PEP_393 | |||
return PyUnicode_READ(self->kind, self->data, index); | |||
#else | |||
return self->data[index]; | |||
#endif | |||
} | |||
/* | |||
@@ -177,11 +149,7 @@ Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index) | |||
*/ | |||
PyObject* Textbuffer_render(Textbuffer* self) | |||
{ | |||
#ifdef PEP_393 | |||
return PyUnicode_FromKindAndData(self->kind, self->data, self->length); | |||
#else | |||
return PyUnicode_FromUnicode(self->data, self->length); | |||
#endif | |||
} | |||
/* | |||
@@ -196,14 +164,9 @@ int Textbuffer_concat(Textbuffer* self, Textbuffer* other) | |||
return -1; | |||
} | |||
#ifdef PEP_393 | |||
assert(self->kind == other->kind); | |||
memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data, | |||
other->length * other->kind); | |||
#else | |||
memcpy(self->data + self->length, other->data, | |||
other->length * sizeof(Unicode)); | |||
#endif | |||
self->length = newlen; | |||
return 0; | |||
@@ -215,18 +178,12 @@ int Textbuffer_concat(Textbuffer* self, Textbuffer* other) | |||
void Textbuffer_reverse(Textbuffer* self) | |||
{ | |||
Py_ssize_t i, end = self->length - 1; | |||
Unicode tmp; | |||
Py_UCS4 tmp; | |||
for (i = 0; i < self->length / 2; i++) { | |||
#ifdef PEP_393 | |||
tmp = PyUnicode_READ(self->kind, self->data, i); | |||
PyUnicode_WRITE(self->kind, self->data, i, | |||
PyUnicode_READ(self->kind, self->data, end - i)); | |||
PyUnicode_WRITE(self->kind, self->data, end - i, tmp); | |||
#else | |||
tmp = self->data[i]; | |||
self->data[i] = self->data[end - i]; | |||
self->data[end - i] = tmp; | |||
#endif | |||
} | |||
} |
@@ -29,8 +29,8 @@ SOFTWARE. | |||
Textbuffer* Textbuffer_new(TokenizerInput*); | |||
void Textbuffer_dealloc(Textbuffer*); | |||
int Textbuffer_reset(Textbuffer*); | |||
int Textbuffer_write(Textbuffer*, Unicode); | |||
Unicode Textbuffer_read(Textbuffer*, Py_ssize_t); | |||
int Textbuffer_write(Textbuffer*, Py_UCS4); | |||
Py_UCS4 Textbuffer_read(Textbuffer*, Py_ssize_t); | |||
PyObject* Textbuffer_render(Textbuffer*); | |||
int Textbuffer_concat(Textbuffer*, Textbuffer*); | |||
void Textbuffer_reverse(Textbuffer*); |
@@ -52,7 +52,7 @@ static int Tokenizer_parse_tag(Tokenizer*); | |||
/* | |||
Determine whether the given code point is a marker. | |||
*/ | |||
static int is_marker(Unicode this) | |||
static int is_marker(Py_UCS4 this) | |||
{ | |||
int i; | |||
@@ -442,7 +442,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | |||
static const char* valid = URISCHEME; | |||
Textbuffer* buffer; | |||
PyObject* scheme; | |||
Unicode this; | |||
Py_UCS4 this; | |||
int slashes, i; | |||
if (Tokenizer_check_route(self, LC_EXT_LINK_URI) < 0) | |||
@@ -463,7 +463,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | |||
while (1) { | |||
if (!valid[i]) | |||
goto end_of_loop; | |||
if (this == (Unicode) valid[i]) | |||
if (this == (Py_UCS4) valid[i]) | |||
break; | |||
i++; | |||
} | |||
@@ -516,7 +516,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||
static const char* valid = URISCHEME; | |||
Textbuffer *scheme_buffer = Textbuffer_new(&self->text); | |||
PyObject *scheme; | |||
Unicode chunk; | |||
Py_UCS4 chunk; | |||
Py_ssize_t i; | |||
int slashes, j; | |||
uint64_t new_context; | |||
@@ -536,7 +536,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||
FAIL_ROUTE(0); | |||
return 0; | |||
} | |||
} while (chunk != (Unicode) valid[j++]); | |||
} while (chunk != (Py_UCS4) valid[j++]); | |||
Textbuffer_write(scheme_buffer, chunk); | |||
} | |||
end_of_loop: | |||
@@ -580,7 +580,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||
Handle text in a free external link, including trailing punctuation. | |||
*/ | |||
static int Tokenizer_handle_free_link_text( | |||
Tokenizer* self, int* parens, Textbuffer* tail, Unicode this) | |||
Tokenizer* self, int* parens, Textbuffer* tail, Py_UCS4 this) | |||
{ | |||
#define PUSH_TAIL_BUFFER(tail, error) \ | |||
if (tail && tail->length > 0) { \ | |||
@@ -607,10 +607,10 @@ static int Tokenizer_handle_free_link_text( | |||
Return whether the current head is the end of a free link. | |||
*/ | |||
static int | |||
Tokenizer_is_free_link(Tokenizer* self, Unicode this, Unicode next) | |||
Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) | |||
{ | |||
// Built from Tokenizer_parse()'s end sentinels: | |||
Unicode after = Tokenizer_read(self, 2); | |||
Py_UCS4 after = Tokenizer_read(self, 2); | |||
uint64_t ctx = self->topstack->context; | |||
return (!this || this == '\n' || this == '[' || this == ']' || | |||
@@ -628,7 +628,7 @@ static PyObject* | |||
Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||
Textbuffer* extra) | |||
{ | |||
Unicode this, next; | |||
Py_UCS4 this, next; | |||
int parens = 0; | |||
if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) : | |||
@@ -816,11 +816,7 @@ static int Tokenizer_parse_heading(Tokenizer* self) | |||
if (!heading) { | |||
return -1; | |||
} | |||
#ifdef IS_PY3K | |||
level = PyLong_FromSsize_t(heading->level); | |||
#else | |||
level = PyInt_FromSsize_t(heading->level); | |||
#endif | |||
if (!level) { | |||
Py_DECREF(heading->title); | |||
free(heading); | |||
@@ -933,7 +929,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self) | |||
static int Tokenizer_really_parse_entity(Tokenizer* self) | |||
{ | |||
PyObject *kwargs, *charobj, *textobj; | |||
Unicode this; | |||
Py_UCS4 this; | |||
int numeric, hexadecimal, i, j, zeroes, test; | |||
char *valid, *text, *buffer, *def; | |||
@@ -1014,7 +1010,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) | |||
while (1) { | |||
if (!valid[j]) | |||
FAIL_ROUTE_AND_EXIT() | |||
if (this == (Unicode) valid[j]) | |||
if (this == (Py_UCS4) valid[j]) | |||
break; | |||
j++; | |||
} | |||
@@ -1111,7 +1107,7 @@ static int Tokenizer_parse_comment(Tokenizer* self) | |||
{ | |||
Py_ssize_t reset = self->head + 3; | |||
PyObject *comment; | |||
Unicode this; | |||
Py_UCS4 this; | |||
self->head += 4; | |||
if (Tokenizer_push(self, 0)) | |||
@@ -1211,7 +1207,7 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data) | |||
Handle whitespace inside of an HTML open tag. | |||
*/ | |||
static int Tokenizer_handle_tag_space( | |||
Tokenizer* self, TagData* data, Unicode text) | |||
Tokenizer* self, TagData* data, Py_UCS4 text) | |||
{ | |||
uint64_t ctx = data->context; | |||
uint64_t end_of_value = (ctx & TAG_ATTR_VALUE && | |||
@@ -1243,9 +1239,9 @@ static int Tokenizer_handle_tag_space( | |||
/* | |||
Handle regular text inside of an HTML open tag. | |||
*/ | |||
static int Tokenizer_handle_tag_text(Tokenizer* self, Unicode text) | |||
static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UCS4 text) | |||
{ | |||
Unicode next = Tokenizer_read(self, 1); | |||
Py_UCS4 next = Tokenizer_read(self, 1); | |||
if (!is_marker(text) || !Tokenizer_CAN_RECURSE(self)) | |||
return Tokenizer_emit_char(self, text); | |||
@@ -1262,7 +1258,7 @@ static int Tokenizer_handle_tag_text(Tokenizer* self, Unicode text) | |||
Handle all sorts of text data inside of an HTML open tag. | |||
*/ | |||
static int Tokenizer_handle_tag_data( | |||
Tokenizer* self, TagData* data, Unicode chunk) | |||
Tokenizer* self, TagData* data, Py_UCS4 chunk) | |||
{ | |||
PyObject *trash; | |||
int first_time, escaped; | |||
@@ -1444,7 +1440,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self) | |||
{ | |||
Textbuffer* buffer; | |||
PyObject *buf_tmp, *end_tag, *start_tag; | |||
Unicode this, next; | |||
Py_UCS4 this, next; | |||
Py_ssize_t reset; | |||
int cmp; | |||
@@ -1600,7 +1596,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) | |||
{ | |||
TagData *data = TagData_new(&self->text); | |||
PyObject *token, *text, *trash; | |||
Unicode this, next; | |||
Py_UCS4 this, next; | |||
int can_exit; | |||
if (!data) | |||
@@ -1686,7 +1682,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) | |||
Py_ssize_t reset = self->head + 1, pos = 0; | |||
Textbuffer* buf; | |||
PyObject *name, *tag; | |||
Unicode this; | |||
Py_UCS4 this; | |||
self->head += 2; | |||
buf = Textbuffer_new(&self->text); | |||
@@ -1988,7 +1984,7 @@ static PyObject* Tokenizer_parse_style(Tokenizer* self) | |||
static int Tokenizer_handle_list_marker(Tokenizer* self) | |||
{ | |||
PyObject *kwargs, *markup; | |||
Unicode code = Tokenizer_read(self, 0); | |||
Py_UCS4 code = Tokenizer_read(self, 0); | |||
if (code == ';') | |||
self->topstack->context |= LC_DLTERM; | |||
@@ -2015,7 +2011,7 @@ static int Tokenizer_handle_list_marker(Tokenizer* self) | |||
*/ | |||
static int Tokenizer_handle_list(Tokenizer* self) | |||
{ | |||
Unicode marker = Tokenizer_read(self, 1); | |||
Py_UCS4 marker = Tokenizer_read(self, 1); | |||
if (Tokenizer_handle_list_marker(self)) | |||
return -1; | |||
@@ -2169,11 +2165,11 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, | |||
/* | |||
Handle style attributes for a table until an ending token. | |||
*/ | |||
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token) | |||
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Py_UCS4 end_token) | |||
{ | |||
TagData *data = TagData_new(&self->text); | |||
PyObject *padding, *trash; | |||
Unicode this; | |||
Py_UCS4 this; | |||
int can_exit; | |||
if (!data) | |||
@@ -2483,7 +2479,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) | |||
everything is safe, or -1 if the route must be failed. | |||
*/ | |||
static int | |||
Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Unicode data) | |||
Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UCS4 data) | |||
{ | |||
if (context & LC_FAIL_NEXT) | |||
return -1; | |||
@@ -2568,7 +2564,7 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Unicode data) | |||
static int Tokenizer_has_leading_whitespace(Tokenizer* self) | |||
{ | |||
int offset = 1; | |||
Unicode current_character; | |||
Py_UCS4 current_character; | |||
while (1) { | |||
current_character = Tokenizer_read_backwards(self, offset); | |||
if (!current_character || current_character == '\n') | |||
@@ -2586,7 +2582,7 @@ static int Tokenizer_has_leading_whitespace(Tokenizer* self) | |||
PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) | |||
{ | |||
uint64_t this_context; | |||
Unicode this, next, next_next, last; | |||
Py_UCS4 this, next, next_next, last; | |||
PyObject* temp; | |||
if (push) { | |||
@@ -24,7 +24,7 @@ SOFTWARE. | |||
#include "common.h" | |||
static const Unicode MARKERS[] = { | |||
static const Py_UCS4 MARKERS[] = { | |||
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', | |||
'-', '!', '\n', '\0'}; | |||
@@ -275,7 +275,7 @@ int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token, | |||
/* | |||
Write a Unicode codepoint to the current textbuffer. | |||
*/ | |||
int Tokenizer_emit_char(Tokenizer* self, Unicode code) | |||
int Tokenizer_emit_char(Tokenizer* self, Py_UCS4 code) | |||
{ | |||
return Textbuffer_write(self->topstack->textbuffer, code); | |||
} | |||
@@ -389,19 +389,15 @@ int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text) | |||
/* | |||
Internal function to read the codepoint at the given index from the input. | |||
*/ | |||
static Unicode read_codepoint(TokenizerInput* text, Py_ssize_t index) | |||
static Py_UCS4 read_codepoint(TokenizerInput* text, Py_ssize_t index) | |||
{ | |||
#ifdef PEP_393 | |||
return PyUnicode_READ(text->kind, text->data, index); | |||
#else | |||
return text->buf[index]; | |||
#endif | |||
} | |||
/* | |||
Read the value at a relative point in the wikicode, forwards. | |||
*/ | |||
Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta) | |||
Py_UCS4 Tokenizer_read(Tokenizer* self, Py_ssize_t delta) | |||
{ | |||
Py_ssize_t index = self->head + delta; | |||
@@ -413,7 +409,7 @@ Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta) | |||
/* | |||
Read the value at a relative point in the wikicode, backwards. | |||
*/ | |||
Unicode Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) | |||
Py_UCS4 Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) | |||
{ | |||
Py_ssize_t index; | |||
@@ -38,14 +38,14 @@ void Tokenizer_free_bad_route_tree(Tokenizer*); | |||
int Tokenizer_emit_token(Tokenizer*, PyObject*, int); | |||
int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int); | |||
int Tokenizer_emit_char(Tokenizer*, Unicode); | |||
int Tokenizer_emit_char(Tokenizer*, Py_UCS4); | |||
int Tokenizer_emit_text(Tokenizer*, const char*); | |||
int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*); | |||
int Tokenizer_emit_all(Tokenizer*, PyObject*); | |||
int Tokenizer_emit_text_then_stack(Tokenizer*, const char*); | |||
Unicode Tokenizer_read(Tokenizer*, Py_ssize_t); | |||
Unicode Tokenizer_read_backwards(Tokenizer*, Py_ssize_t); | |||
Py_UCS4 Tokenizer_read(Tokenizer*, Py_ssize_t); | |||
Py_UCS4 Tokenizer_read_backwards(Tokenizer*, Py_ssize_t); | |||
/* Macros */ | |||
@@ -85,12 +85,8 @@ static void init_tokenizer_text(TokenizerInput* text) | |||
text->object = Py_None; | |||
Py_INCREF(Py_None); | |||
text->length = 0; | |||
#ifdef PEP_393 | |||
text->kind = PyUnicode_WCHAR_KIND; | |||
text->data = NULL; | |||
#else | |||
text->buf = NULL; | |||
#endif | |||
} | |||
/* | |||
@@ -119,14 +115,10 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input) | |||
dealloc_tokenizer_text(text); | |||
text->object = input; | |||
#ifdef PEP_393 | |||
if (PyUnicode_READY(input) < 0) | |||
return -1; | |||
text->kind = PyUnicode_KIND(input); | |||
text->data = PyUnicode_DATA(input); | |||
#else | |||
text->buf = PyUnicode_AS_UNICODE(input); | |||
#endif | |||
text->length = PyUnicode_GET_LENGTH(input); | |||
return 0; | |||
} | |||
@@ -192,11 +184,9 @@ static int load_entities(void) | |||
{ | |||
PyObject *tempmod, *defmap, *deflist; | |||
unsigned numdefs, i; | |||
#ifdef IS_PY3K | |||
PyObject *string; | |||
#endif | |||
tempmod = PyImport_ImportModule(ENTITYDEFS_MODULE); | |||
tempmod = PyImport_ImportModule("html.entities"); | |||
if (!tempmod) | |||
return -1; | |||
defmap = PyObject_GetAttrString(tempmod, "entitydefs"); | |||
@@ -212,14 +202,10 @@ static int load_entities(void) | |||
if (!entitydefs) | |||
return -1; | |||
for (i = 0; i < numdefs; i++) { | |||
#ifdef IS_PY3K | |||
string = PyUnicode_AsASCIIString(PyList_GET_ITEM(deflist, i)); | |||
if (!string) | |||
return -1; | |||
entitydefs[i] = PyBytes_AsString(string); | |||
#else | |||
entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i)); | |||
#endif | |||
if (!entitydefs[i]) | |||
return -1; | |||
} | |||
@@ -233,7 +219,7 @@ static int load_tokens(void) | |||
*globals = PyEval_GetGlobals(), | |||
*locals = PyEval_GetLocals(), | |||
*fromlist = PyList_New(1), | |||
*modname = IMPORT_NAME_FUNC("tokens"); | |||
*modname = PyUnicode_FromString("tokens"); | |||
char *name = "mwparserfromhell.parser"; | |||
if (!fromlist || !modname) | |||
@@ -256,7 +242,7 @@ static int load_defs(void) | |||
*globals = PyEval_GetGlobals(), | |||
*locals = PyEval_GetLocals(), | |||
*fromlist = PyList_New(1), | |||
*modname = IMPORT_NAME_FUNC("definitions"); | |||
*modname = PyUnicode_FromString("definitions"); | |||
char *name = "mwparserfromhell"; | |||
if (!fromlist || !modname) | |||
@@ -277,7 +263,7 @@ static int load_exceptions(void) | |||
*globals = PyEval_GetGlobals(), | |||
*locals = PyEval_GetLocals(), | |||
*fromlist = PyList_New(1), | |||
*modname = IMPORT_NAME_FUNC("parser"); | |||
*modname = PyUnicode_FromString("parser"); | |||
char *name = "mwparserfromhell"; | |||
if (!fromlist || !modname) | |||
@@ -294,24 +280,22 @@ static int load_exceptions(void) | |||
return 0; | |||
} | |||
PyMODINIT_FUNC INIT_FUNC_NAME(void) | |||
PyMODINIT_FUNC PyInit__tokenizer(void) | |||
{ | |||
PyObject *module; | |||
TokenizerType.tp_new = PyType_GenericNew; | |||
if (PyType_Ready(&TokenizerType) < 0) | |||
INIT_ERROR; | |||
module = CREATE_MODULE; | |||
return NULL; | |||
module = PyModule_Create(&module_def); | |||
if (!module) | |||
INIT_ERROR; | |||
return NULL; | |||
Py_INCREF(&TokenizerType); | |||
PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType); | |||
Py_INCREF(Py_True); | |||
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True); | |||
NOARGS = PyTuple_New(0); | |||
if (!NOARGS || load_entities() || load_tokens() || load_defs()) | |||
INIT_ERROR; | |||
#ifdef IS_PY3K | |||
return NULL; | |||
return module; | |||
#endif | |||
} |
@@ -32,22 +32,6 @@ static void Tokenizer_dealloc(Tokenizer*); | |||
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); | |||
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | |||
/* Compatibility macros */ | |||
#ifdef IS_PY3K | |||
#define IMPORT_NAME_FUNC PyUnicode_FromString | |||
#define CREATE_MODULE PyModule_Create(&module_def); | |||
#define ENTITYDEFS_MODULE "html.entities" | |||
#define INIT_FUNC_NAME PyInit__tokenizer | |||
#define INIT_ERROR return NULL | |||
#else | |||
#define IMPORT_NAME_FUNC PyBytes_FromString | |||
#define CREATE_MODULE Py_InitModule("_tokenizer", NULL); | |||
#define ENTITYDEFS_MODULE "htmlentitydefs" | |||
#define INIT_FUNC_NAME init_tokenizer | |||
#define INIT_ERROR return | |||
#endif | |||
/* Structs */ | |||
static PyMethodDef Tokenizer_methods[] = { | |||
@@ -101,11 +85,9 @@ static PyTypeObject TokenizerType = { | |||
Tokenizer_new, /* tp_new */ | |||
}; | |||
#ifdef IS_PY3K | |||
static PyModuleDef module_def = { | |||
PyModuleDef_HEAD_INIT, | |||
"_tokenizer", | |||
"Creates a list of tokens from a string of wikicode.", | |||
-1, NULL, NULL, NULL, NULL, NULL | |||
}; | |||
#endif |
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,12 +19,11 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
import html.entities as htmlentities | |||
from math import log | |||
import re | |||
from . import contexts, tokens, ParserError | |||
from ..compat import htmlentities, range | |||
from ..definitions import (get_html_tag, is_parsable, is_single, | |||
is_single_only, is_scheme) | |||
@@ -35,11 +33,11 @@ class BadRoute(Exception): | |||
"""Raised internally when the current tokenization route is invalid.""" | |||
def __init__(self, context=0): | |||
super(BadRoute, self).__init__() | |||
super().__init__() | |||
self.context = context | |||
class _TagOpenData(object): | |||
class _TagOpenData: | |||
"""Stores data about an HTML open tag, like ``<ref name="foo">``.""" | |||
CX_NAME = 1 << 0 | |||
CX_ATTR_READY = 1 << 1 | |||
@@ -57,7 +55,7 @@ class _TagOpenData(object): | |||
self.reset = 0 | |||
class Tokenizer(object): | |||
class Tokenizer: | |||
"""Creates a list of tokens from a string of wikicode.""" | |||
USES_C = False | |||
START = object() | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -28,9 +27,6 @@ a syntactically valid form by the :class:`.Tokenizer`, and then converted into | |||
the :class:`.Wikicode` tree by the :class:`.Builder`. | |||
""" | |||
from __future__ import unicode_literals | |||
from ..compat import py3k, str | |||
__all__ = ["Token"] | |||
@@ -65,7 +61,7 @@ class Token(dict): | |||
def make(name): | |||
"""Create a new Token class using ``type()`` and add it to ``__all__``.""" | |||
__all__.append(name) | |||
return type(name if py3k else name.encode("utf8"), (Token,), {}) | |||
return type(name, (Token,), {}) | |||
Text = make("Text") | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com> | |||
@@ -24,7 +23,6 @@ | |||
# SmartList has to be a full import in order to avoid cyclical import errors | |||
import mwparserfromhell.smart_list.SmartList | |||
from .utils import _SliceNormalizerMixIn, inheritdoc | |||
from ..compat import py3k | |||
class _ListProxy(_SliceNormalizerMixIn, list): | |||
@@ -36,7 +34,7 @@ class _ListProxy(_SliceNormalizerMixIn, list): | |||
""" | |||
def __init__(self, parent, sliceinfo): | |||
super(_ListProxy, self).__init__() | |||
super().__init__() | |||
self._parent = parent | |||
self._sliceinfo = sliceinfo | |||
@@ -73,12 +71,8 @@ class _ListProxy(_SliceNormalizerMixIn, list): | |||
return self._render() >= list(other) | |||
return self._render() >= other | |||
if py3k: | |||
def __bool__(self): | |||
return bool(self._render()) | |||
else: | |||
def __nonzero__(self): | |||
return bool(self._render()) | |||
def __bool__(self): | |||
return bool(self._render()) | |||
def __len__(self): | |||
return max((self._stop - self._start) // self._step, 0) | |||
@@ -138,16 +132,6 @@ class _ListProxy(_SliceNormalizerMixIn, list): | |||
def __contains__(self, item): | |||
return item in self._render() | |||
if not py3k: | |||
def __getslice__(self, start, stop): | |||
return self.__getitem__(slice(start, stop)) | |||
def __setslice__(self, start, stop, iterable): | |||
self.__setitem__(slice(start, stop), iterable) | |||
def __delslice__(self, start, stop): | |||
self.__delitem__(slice(start, stop)) | |||
def __add__(self, other): | |||
return mwparserfromhell.smart_list.SmartList(list(self) + other) | |||
@@ -237,27 +221,13 @@ class _ListProxy(_SliceNormalizerMixIn, list): | |||
item.reverse() | |||
self._parent[self._start:self._stop:self._step] = item | |||
if py3k: | |||
@inheritdoc | |||
def sort(self, key=None, reverse=None): | |||
item = self._render() | |||
kwargs = {} | |||
if key is not None: | |||
kwargs["key"] = key | |||
if reverse is not None: | |||
kwargs["reverse"] = reverse | |||
item.sort(**kwargs) | |||
self._parent[self._start:self._stop:self._step] = item | |||
else: | |||
@inheritdoc | |||
def sort(self, cmp=None, key=None, reverse=None): | |||
item = self._render() | |||
kwargs = {} | |||
if cmp is not None: | |||
kwargs["cmp"] = cmp | |||
if key is not None: | |||
kwargs["key"] = key | |||
if reverse is not None: | |||
kwargs["reverse"] = reverse | |||
item.sort(**kwargs) | |||
self._parent[self._start:self._stop:self._step] = item | |||
@inheritdoc | |||
def sort(self, key=None, reverse=None): | |||
item = self._render() | |||
kwargs = {} | |||
if key is not None: | |||
kwargs["key"] = key | |||
if reverse is not None: | |||
kwargs["reverse"] = reverse | |||
item.sort(**kwargs) | |||
self._parent[self._start:self._stop:self._step] = item |
@@ -1,5 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com> | |||
# | |||
@@ -25,7 +23,6 @@ from _weakref import ref | |||
from .ListProxy import _ListProxy | |||
from .utils import _SliceNormalizerMixIn, inheritdoc | |||
from ..compat import py3k | |||
class SmartList(_SliceNormalizerMixIn, list): | |||
@@ -54,14 +51,14 @@ class SmartList(_SliceNormalizerMixIn, list): | |||
def __init__(self, iterable=None): | |||
if iterable: | |||
super(SmartList, self).__init__(iterable) | |||
super().__init__(iterable) | |||
else: | |||
super(SmartList, self).__init__() | |||
super().__init__() | |||
self._children = {} | |||
def __getitem__(self, key): | |||
if not isinstance(key, slice): | |||
return super(SmartList, self).__getitem__(key) | |||
return super().__getitem__(key) | |||
key = self._normalize_slice(key, clamp=False) | |||
sliceinfo = [key.start, key.stop, key.step] | |||
child = _ListProxy(self, sliceinfo) | |||
@@ -71,44 +68,32 @@ class SmartList(_SliceNormalizerMixIn, list): | |||
def __setitem__(self, key, item): | |||
if not isinstance(key, slice): | |||
return super(SmartList, self).__setitem__(key, item) | |||
return super().__setitem__(key, item) | |||
item = list(item) | |||
super(SmartList, self).__setitem__(key, item) | |||
super().__setitem__(key, item) | |||
key = self._normalize_slice(key, clamp=True) | |||
diff = len(item) + (key.start - key.stop) // key.step | |||
if not diff: | |||
return | |||
values = self._children.values if py3k else self._children.itervalues | |||
for child, (start, stop, step) in values(): | |||
for child, (start, stop, step) in self._children.values(): | |||
if start > key.stop: | |||
self._children[id(child)][1][0] += diff | |||
if stop is not None and stop >= key.stop: | |||
self._children[id(child)][1][1] += diff | |||
def __delitem__(self, key): | |||
super(SmartList, self).__delitem__(key) | |||
super().__delitem__(key) | |||
if isinstance(key, slice): | |||
key = self._normalize_slice(key, clamp=True) | |||
else: | |||
key = slice(key, key + 1, 1) | |||
diff = (key.stop - key.start) // key.step | |||
values = self._children.values if py3k else self._children.itervalues | |||
for child, (start, stop, step) in values(): | |||
for child, (start, stop, step) in self._children.values(): | |||
if start > key.start: | |||
self._children[id(child)][1][0] -= diff | |||
if stop is not None and stop >= key.stop: | |||
self._children[id(child)][1][1] -= diff | |||
if not py3k: | |||
def __getslice__(self, start, stop): | |||
return self.__getitem__(slice(start, stop)) | |||
def __setslice__(self, start, stop, iterable): | |||
self.__setitem__(slice(start, stop), iterable) | |||
def __delslice__(self, start, stop): | |||
self.__delitem__(slice(start, stop)) | |||
def __add__(self, other): | |||
return SmartList(list(self) + other) | |||
@@ -159,27 +144,14 @@ class SmartList(_SliceNormalizerMixIn, list): | |||
@inheritdoc | |||
def reverse(self): | |||
self._detach_children() | |||
super(SmartList, self).reverse() | |||
if py3k: | |||
@inheritdoc | |||
def sort(self, key=None, reverse=None): | |||
self._detach_children() | |||
kwargs = {} | |||
if key is not None: | |||
kwargs["key"] = key | |||
if reverse is not None: | |||
kwargs["reverse"] = reverse | |||
super(SmartList, self).sort(**kwargs) | |||
else: | |||
@inheritdoc | |||
def sort(self, cmp=None, key=None, reverse=None): | |||
self._detach_children() | |||
kwargs = {} | |||
if cmp is not None: | |||
kwargs["cmp"] = cmp | |||
if key is not None: | |||
kwargs["key"] = key | |||
if reverse is not None: | |||
kwargs["reverse"] = reverse | |||
super(SmartList, self).sort(**kwargs) | |||
super().reverse() | |||
@inheritdoc | |||
def sort(self, key=None, reverse=None): | |||
self._detach_children() | |||
kwargs = {} | |||
if key is not None: | |||
kwargs["key"] = key | |||
if reverse is not None: | |||
kwargs["reverse"] = reverse | |||
super().sort(**kwargs) |
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com> | |||
@@ -1,5 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2019-2020 Yuri Astrakhan <YuriAstrakhan@gmail.com> | |||
# | |||
@@ -21,8 +19,6 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from sys import maxsize | |||
__all__ = [] | |||
@@ -38,7 +34,7 @@ def inheritdoc(method): | |||
return method | |||
class _SliceNormalizerMixIn(object): | |||
class _SliceNormalizerMixIn: | |||
"""MixIn that provides a private method to normalize slices.""" | |||
def _normalize_slice(self, key, clamp=False): | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -22,14 +21,11 @@ | |||
""" | |||
This module contains the :class:`.StringMixIn` type, which implements the | |||
interface for the ``unicode`` type (``str`` on py3k) in a dynamic manner. | |||
interface for the ``str`` type in a dynamic manner. | |||
""" | |||
from __future__ import unicode_literals | |||
from sys import getdefaultencoding | |||
from .compat import bytes, py3k, str | |||
__all__ = ["StringMixIn"] | |||
def inheritdoc(method): | |||
@@ -41,24 +37,20 @@ def inheritdoc(method): | |||
method.__doc__ = getattr(str, method.__name__).__doc__ | |||
return method | |||
class StringMixIn(object): | |||
class StringMixIn: | |||
"""Implement the interface for ``unicode``/``str`` in a dynamic manner. | |||
To use this class, inherit from it and override the :meth:`__unicode__` | |||
method (same on py3k) to return the string representation of the object. | |||
method to return the string representation of the object. | |||
The various string methods will operate on the value of :meth:`__unicode__` | |||
instead of the immutable ``self`` like the regular ``str`` type. | |||
""" | |||
if py3k: | |||
def __str__(self): | |||
return self.__unicode__() | |||
def __str__(self): | |||
return self.__unicode__() | |||
def __bytes__(self): | |||
return bytes(self.__unicode__(), getdefaultencoding()) | |||
else: | |||
def __str__(self): | |||
return bytes(self.__unicode__()) | |||
def __bytes__(self): | |||
return bytes(self.__unicode__(), getdefaultencoding()) | |||
def __unicode__(self): | |||
raise NotImplementedError() | |||
@@ -84,19 +76,14 @@ class StringMixIn(object): | |||
def __ge__(self, other): | |||
return self.__unicode__() >= other | |||
if py3k: | |||
def __bool__(self): | |||
return bool(self.__unicode__()) | |||
else: | |||
def __nonzero__(self): | |||
return bool(self.__unicode__()) | |||
def __bool__(self): | |||
return bool(self.__unicode__()) | |||
def __len__(self): | |||
return len(self.__unicode__()) | |||
def __iter__(self): | |||
for char in self.__unicode__(): | |||
yield char | |||
yield from self.__unicode__() | |||
def __getitem__(self, key): | |||
return self.__unicode__()[key] | |||
@@ -113,8 +100,7 @@ class StringMixIn(object): | |||
type(self).__name__, attr)) | |||
return getattr(self.__unicode__(), attr) | |||
if py3k: | |||
maketrans = str.maketrans # Static method can't rely on __getattr__ | |||
maketrans = str.maketrans # Static method can't rely on __getattr__ | |||
del inheritdoc |
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -25,9 +24,7 @@ This module contains accessory functions for other parts of the library. Parser | |||
users generally won't need stuff from here. | |||
""" | |||
from __future__ import unicode_literals | |||
from .compat import bytes, str | |||
from .nodes import Node | |||
from .smart_list import SmartList | |||
@@ -1,4 +1,3 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2019 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -20,12 +19,9 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
import re | |||
from itertools import chain | |||
from .compat import bytes, py3k, range, str | |||
from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, | |||
Node, Tag, Template, Text, Wikilink) | |||
from .smart_list.ListProxy import _ListProxy | |||
@@ -49,7 +45,7 @@ class Wikicode(StringMixIn): | |||
RECURSE_OTHERS = 2 | |||
def __init__(self, nodes): | |||
super(Wikicode, self).__init__() | |||
super().__init__() | |||
self._nodes = nodes | |||
def __unicode__(self): | |||
@@ -64,8 +60,7 @@ class Wikicode(StringMixIn): | |||
for code in node.__children__(): | |||
for child in code.nodes: | |||
sub = Wikicode._get_children(child, contexts, restrict, code) | |||
for result in sub: | |||
yield result | |||
yield from sub | |||
@staticmethod | |||
def _slice_replace(code, index, old, new): | |||
@@ -253,7 +248,7 @@ class Wikicode(StringMixIn): | |||
self.ifilter(forcetype=ftype, *a, **kw)) | |||
make_filter = lambda ftype: (lambda self, *a, **kw: | |||
self.filter(forcetype=ftype, *a, **kw)) | |||
for name, ftype in (meths.items() if py3k else meths.iteritems()): | |||
for name, ftype in meths.items(): | |||
ifilter = make_ifilter(ftype) | |||
filter = make_filter(ftype) | |||
ifilter.__doc__ = doc.format(name, "ifilter", ftype) | |||
@@ -40,7 +40,6 @@ import sys | |||
import psutil | |||
from mwparserfromhell.compat import py3k | |||
from mwparserfromhell.parser._tokenizer import CTokenizer | |||
if sys.version_info[0] == 2: | |||
@@ -88,8 +87,6 @@ class MemoryTest(object): | |||
def load_file(filename): | |||
with open(filename, "rU") as fp: | |||
text = fp.read() | |||
if not py3k: | |||
text = text.decode("utf8") | |||
name = path.split(filename)[1][:0-len(extension)] | |||
self._parse_file(name, text) | |||
@@ -1,5 +1,4 @@ | |||
#! /usr/bin/env python | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
@@ -21,23 +20,20 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import print_function | |||
from distutils.errors import DistutilsError, CCompilerError | |||
from glob import glob | |||
from os import environ | |||
import sys | |||
if ((sys.version_info[0] == 2 and sys.version_info[1] < 7) or | |||
(sys.version_info[1] == 3 and sys.version_info[1] < 4)): | |||
raise RuntimeError("mwparserfromhell needs Python 2.7 or 3.4+") | |||
if sys.version_info[1] == 3 and sys.version_info[1] < 4: | |||
raise RuntimeError("mwparserfromhell needs 3.4+") | |||
from setuptools import setup, find_packages, Extension | |||
from setuptools.command.build_ext import build_ext | |||
from mwparserfromhell import __version__ | |||
from mwparserfromhell.compat import py3k | |||
with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp: | |||
with open("README.rst", encoding='utf-8') as fp: | |||
long_docs = fp.read() | |||
use_extension = True | |||