Autoformat: black + clang-format + clang-tidy

tags/v0.6.3
Ben Kurtovic, 2 years ago · commit 8cd0bdb322
68 changed files with 4288 additions and 2698 deletions
  1. +13 -0 .clang-format
  2. +1 -0 .gitignore
  3. +11 -6 scripts/memtest.py
  4. +32 -23 setup.py
  5. +1 -2 src/mwparserfromhell/__init__.py
  6. +16 -4 src/mwparserfromhell/definitions.py
  7. +12 -2 src/mwparserfromhell/nodes/__init__.py
  8. +2 -0 src/mwparserfromhell/nodes/_base.py
  9. +1 -0 src/mwparserfromhell/nodes/argument.py
  10. +1 -0 src/mwparserfromhell/nodes/comment.py
  11. +2 -0 src/mwparserfromhell/nodes/external_link.py
  12. +10 -2 src/mwparserfromhell/nodes/extras/attribute.py
  13. +2 -2 src/mwparserfromhell/nodes/extras/parameter.py
  14. +1 -0 src/mwparserfromhell/nodes/heading.py
  15. +12 -5 src/mwparserfromhell/nodes/html_entity.py
  16. +32 -8 src/mwparserfromhell/nodes/tag.py
  17. +7 -6 src/mwparserfromhell/nodes/template.py
  18. +1 -0 src/mwparserfromhell/nodes/text.py
  19. +1 -0 src/mwparserfromhell/nodes/wikilink.py
  20. +5 -0 src/mwparserfromhell/parser/__init__.py
  21. +57 -20 src/mwparserfromhell/parser/builder.py
  22. +59 -34 src/mwparserfromhell/parser/contexts.py
  23. +525 -510 src/mwparserfromhell/parser/ctokenizer/avl_tree.c
  24. +166 -177 src/mwparserfromhell/parser/ctokenizer/avl_tree.h
  25. +34 -33 src/mwparserfromhell/parser/ctokenizer/common.h
  26. +11 -5 src/mwparserfromhell/parser/ctokenizer/contexts.h
  27. +31 -23 src/mwparserfromhell/parser/ctokenizer/definitions.c
  28. +5 -6 src/mwparserfromhell/parser/ctokenizer/definitions.h
  29. +20 -14 src/mwparserfromhell/parser/ctokenizer/tag_data.c
  30. +6 -6 src/mwparserfromhell/parser/ctokenizer/tag_data.h
  31. +45 -26 src/mwparserfromhell/parser/ctokenizer/textbuffer.c
  32. +8 -8 src/mwparserfromhell/parser/ctokenizer/textbuffer.h
  33. +966 -685 src/mwparserfromhell/parser/ctokenizer/tok_parse.c
  34. +4 -3 src/mwparserfromhell/parser/ctokenizer/tok_parse.h
  35. +122 -79 src/mwparserfromhell/parser/ctokenizer/tok_support.c
  36. +28 -31 src/mwparserfromhell/parser/ctokenizer/tok_support.h
  37. +94 -67 src/mwparserfromhell/parser/ctokenizer/tokenizer.c
  38. +56 -47 src/mwparserfromhell/parser/ctokenizer/tokenizer.h
  39. +42 -44 src/mwparserfromhell/parser/ctokenizer/tokens.c
  40. +38 -38 src/mwparserfromhell/parser/ctokenizer/tokens.h
  41. +2 -0 src/mwparserfromhell/parser/errors.py
  42. +126 -52 src/mwparserfromhell/parser/tokenizer.py
  43. +30 -28 src/mwparserfromhell/parser/tokens.py
  44. +4 -4 src/mwparserfromhell/smart_list/list_proxy.py
  45. +5 -2 src/mwparserfromhell/string_mixin.py
  46. +5 -2 src/mwparserfromhell/utils.py
  47. +57 -22 src/mwparserfromhell/wikicode.py
  48. +22 -2 tests/conftest.py
  49. +16 -2 tests/test_argument.py
  50. +5 -0 tests/test_attribute.py
  51. +737 -326 tests/test_builder.py
  52. +5 -0 tests/test_comment.py
  53. +12 -6 tests/test_docs.py
  54. +12 -7 tests/test_external_link.py
  55. +7 -2 tests/test_heading.py
  56. +9 -0 tests/test_html_entity.py
  57. +4 -0 tests/test_parameter.py
  58. +43 -19 tests/test_parser.py
  59. +27 -2 tests/test_smart_list.py
  60. +95 -20 tests/test_string_mixin.py
  61. +99 -40 tests/test_tag.py
  62. +267 -140 tests/test_template.py
  63. +5 -0 tests/test_text.py
  64. +26 -14 tests/test_tokenizer.py
  65. +9 -5 tests/test_tokens.py
  66. +23 -18 tests/test_utils.py
  67. +140 -67 tests/test_wikicode.py
  68. +16 -2 tests/test_wikilink.py

+13 -0 .clang-format

@@ -0,0 +1,13 @@
+BasedOnStyle: LLVM
+AlignConsecutiveMacros: AcrossEmptyLines
+AllowShortFunctionsOnASingleLine: Inline
+AlwaysBreakAfterReturnType: TopLevelDefinitions
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBraces: Linux
+ColumnLimit: 88
+IndentPPDirectives: AfterHash
+IndentWidth: 4
+SpaceAfterCStyleCast: true
+StatementMacros:
+  - PyObject_HEAD
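
As a rough illustration of what these options produce (a hypothetical snippet, not code from this commit): four-space indents, Linux-style braces, the return type of a top-level definition on its own line, preprocessor directives indented after the hash, and a space after C-style casts:

    #include <stddef.h>

    #ifdef USE_TRACE
    #    define TRACING 1 /* IndentPPDirectives: AfterHash */
    #else
    #    define TRACING 0
    #endif

    /* AlwaysBreakAfterReturnType: TopLevelDefinitions */
    static unsigned long
    hash_bytes(const void *buf, size_t len)
    {
        /* SpaceAfterCStyleCast: true */
        const unsigned char *p = (const unsigned char *) buf;
        unsigned long h = 5381;

        while (len--) { /* BreakBeforeBraces: Linux keeps control-flow braces inline */
            h = h * 33 + *p++;
        }
        return h;
    }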

+1 -0 .gitignore

@@ -13,5 +13,6 @@ dist
 docs/_build
 scripts/*.log
 htmlcov/
+compile_commands.json
 .idea/
 .pytest_cache/

+11 -6 scripts/memtest.py

@@ -41,6 +41,7 @@ from mwparserfromhell.parser._tokenizer import CTokenizer

 LOOPS = 10000

+
 class Color:
     GRAY = "\x1b[30;1m"
     GREEN = "\x1b[92m"
@@ -63,11 +64,11 @@ class MemoryTest:
         data = {"name": None, "label": None, "input": None, "output": None}
         for line in test.strip().splitlines():
             if line.startswith("name:"):
-                data["name"] = line[len("name:"):].strip()
+                data["name"] = line[len("name:") :].strip()
             elif line.startswith("label:"):
-                data["label"] = line[len("label:"):].strip()
+                data["label"] = line[len("label:") :].strip()
             elif line.startswith("input:"):
-                raw = line[len("input:"):].strip()
+                raw = line[len("input:") :].strip()
                 if raw[0] == '"' and raw[-1] == '"':
                     raw = raw[1:-1]
                 raw = raw.encode("raw_unicode_escape")
@@ -81,7 +82,7 @@ class MemoryTest:
         def load_file(filename):
             with open(filename, "rU") as fp:
                 text = fp.read()
-                name = path.split(filename)[1][:0-len(extension)]
+                name = path.split(filename)[1][: 0 - len(extension)]
                 self._parse_file(name, text)

         root = path.split(path.dirname(path.abspath(__file__)))[0]
@@ -119,8 +120,11 @@ class MemoryTest:

         tmpl = "{0}[{1:03}/{2}]{3} {4}: "
         for i, (name, text) in enumerate(self._tests, 1):
-            sys.stdout.write(tmpl.format(Color.GRAY, i, len(self._tests),
-                                         Color.RESET, name.ljust(width)))
+            sys.stdout.write(
+                tmpl.format(
+                    Color.GRAY, i, len(self._tests), Color.RESET, name.ljust(width)
+                )
+            )
             sys.stdout.flush()
             parent, child = Pipe()
             p = Process(target=_runner, args=(text, child))
@@ -156,6 +160,7 @@ def _runner(text, child):
     child.send("OK")
     child.recv()

+
 if __name__ == "__main__":
     setlocale(LC_ALL, "")
     MemoryTest().run()

+32 -23 setup.py

@@ -52,8 +52,10 @@ elif env_var is not None:

 # Remove the command line argument as it isn't understood by setuptools:

-sys.argv = [arg for arg in sys.argv
-            if arg not in ("--without-extension", "--with-extension")]
+sys.argv = [
+    arg for arg in sys.argv if arg not in ("--without-extension", "--with-extension")
+]
+

 def build_ext_patched(self):
     try:
@@ -63,33 +65,40 @@ def build_ext_patched(self):
         print("Falling back to pure Python mode.")
         del self.extensions[:]

+
 if fallback:
     build_ext.run, build_ext_original = build_ext_patched, build_ext.run

 # Project-specific part begins here:

-tokenizer = Extension("mwparserfromhell.parser._tokenizer",
-                      sources=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.c")),
-                      depends=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.h")))
+tokenizer = Extension(
+    "mwparserfromhell.parser._tokenizer",
+    sources=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.c")),
+    depends=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.h")),
+)

 setup(
-    name = "mwparserfromhell",
-    packages = find_packages("src"),
-    package_dir = {"": "src"},
-    ext_modules = [tokenizer] if use_extension else [],
-    setup_requires = ["pytest-runner"] if "test" in sys.argv or "pytest" in sys.argv else [],
-    tests_require = ["pytest"],
-    version = __version__,
-    python_requires = ">= 3.5",
-    author = "Ben Kurtovic",
-    author_email = "ben.kurtovic@gmail.com",
-    url = "https://github.com/earwig/mwparserfromhell",
-    description = "MWParserFromHell is a parser for MediaWiki wikicode.",
-    long_description = long_docs,
-    download_url = "https://github.com/earwig/mwparserfromhell/tarball/v{}".format(__version__),
-    keywords = "earwig mwparserfromhell wikipedia wiki mediawiki wikicode template parsing",
-    license = "MIT License",
-    classifiers = [
+    name="mwparserfromhell",
+    packages=find_packages("src"),
+    package_dir={"": "src"},
+    ext_modules=[tokenizer] if use_extension else [],
+    setup_requires=["pytest-runner"]
+    if "test" in sys.argv or "pytest" in sys.argv
+    else [],
+    tests_require=["pytest"],
+    version=__version__,
+    python_requires=">= 3.5",
+    author="Ben Kurtovic",
+    author_email="ben.kurtovic@gmail.com",
+    url="https://github.com/earwig/mwparserfromhell",
+    description="MWParserFromHell is a parser for MediaWiki wikicode.",
+    long_description=long_docs,
+    download_url="https://github.com/earwig/mwparserfromhell/tarball/v{}".format(
+        __version__
+    ),
+    keywords="earwig mwparserfromhell wikipedia wiki mediawiki wikicode template parsing",
+    license="MIT License",
+    classifiers=[
         "Development Status :: 4 - Beta",
         "Environment :: Console",
         "Intended Audience :: Developers",
@@ -101,6 +110,6 @@ setup(
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
-        "Topic :: Text Processing :: Markup"
+        "Topic :: Text Processing :: Markup",
     ],
 )

+1 -2 src/mwparserfromhell/__init__.py

@@ -30,7 +30,6 @@ __license__ = "MIT License"
 __version__ = "0.7.dev0"
 __email__ = "ben.kurtovic@gmail.com"

-from . import (definitions, nodes, parser, smart_list, string_mixin,
-               utils, wikicode)
+from . import definitions, nodes, parser, smart_list, string_mixin, utils, wikicode

 parse = utils.parse_anything

+16 -4 src/mwparserfromhell/definitions.py

@@ -26,8 +26,14 @@ When updating this file, please also update the the C tokenizer version:
 - mwparserfromhell/parser/ctokenizer/definitions.h
 """

-__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
-           "is_single_only", "is_scheme"]
+__all__ = [
+    "get_html_tag",
+    "is_parsable",
+    "is_visible",
+    "is_single",
+    "is_single_only",
+    "is_scheme",
+]

 URI_SCHEMES = {
     # [wikimedia/mediawiki.git]/includes/DefaultSettings.php @ 5c660de5d0
@@ -92,7 +98,7 @@ INVISIBLE_TAGS = [
     "score",
     "section",
     "templatedata",
-    "timeline"
+    "timeline",
 ]

 # [wikimedia/mediawiki.git]/includes/parser/Sanitizer.php @ 95e17ee645
@@ -103,29 +109,35 @@ MARKUP_TO_HTML = {
     "#": "li",
     "*": "li",
     ";": "dt",
-    ":": "dd"
+    ":": "dd",
 }

+
 def get_html_tag(markup):
     """Return the HTML tag associated with the given wiki-markup."""
     return MARKUP_TO_HTML[markup]

+
 def is_parsable(tag):
     """Return if the given *tag*'s contents should be passed to the parser."""
     return tag.lower() not in PARSER_BLACKLIST

+
 def is_visible(tag):
     """Return whether or not the given *tag* contains visible text."""
     return tag.lower() not in INVISIBLE_TAGS

+
 def is_single(tag):
     """Return whether or not the given *tag* can exist without a close tag."""
     return tag.lower() in SINGLE

+
 def is_single_only(tag):
     """Return whether or not the given *tag* must exist without a close tag."""
     return tag.lower() in SINGLE_ONLY

+
 def is_scheme(scheme, slashes=True):
     """Return whether *scheme* is valid for external links."""
     scheme = scheme.lower()


+12 -2 src/mwparserfromhell/nodes/__init__.py

@@ -39,5 +39,15 @@ from .tag import Tag
 from .template import Template
 from .wikilink import Wikilink

-__all__ = ["Argument", "Comment", "ExternalLink", "HTMLEntity", "Heading",
-           "Node", "Tag", "Template", "Text", "Wikilink"]
+__all__ = [
+    "Argument",
+    "Comment",
+    "ExternalLink",
+    "HTMLEntity",
+    "Heading",
+    "Node",
+    "Tag",
+    "Template",
+    "Text",
+    "Wikilink",
+]

+2 -0 src/mwparserfromhell/nodes/_base.py

@@ -22,6 +22,7 @@ from ..string_mixin import StringMixIn

 __all__ = ["Node"]

+
 class Node(StringMixIn):
     """Represents the base Node type, demonstrating the methods to override.


@@ -35,6 +36,7 @@ class Node(StringMixIn):
     :meth:`__showtree__` can be overridden to build a nice tree representation
     of the node, if desired, for :meth:`~.Wikicode.get_tree`.
     """
+
     def __str__(self):
         raise NotImplementedError()




+1 -0 src/mwparserfromhell/nodes/argument.py

@@ -24,6 +24,7 @@ from ..utils import parse_anything

 __all__ = ["Argument"]

+
 class Argument(Node):
     """Represents a template argument substitution, like ``{{{foo}}}``."""




+1 -0 src/mwparserfromhell/nodes/comment.py

@@ -23,6 +23,7 @@ from ._base import Node

 __all__ = ["Comment"]

+
 class Comment(Node):
     """Represents a hidden HTML comment, like ``<!-- foobar -->``."""




+2 -0 src/mwparserfromhell/nodes/external_link.py

@@ -24,6 +24,7 @@ from ..utils import parse_anything

 __all__ = ["ExternalLink"]

+
 class ExternalLink(Node):
     """Represents an external link, like ``[http://example.com/ Example]``."""


@@ -83,6 +84,7 @@ class ExternalLink(Node):
     def url(self, value):
         # pylint: disable=import-outside-toplevel
         from ..parser import contexts
+
         self._url = parse_anything(value, contexts.EXT_LINK_URI)

     @title.setter


+10 -2 src/mwparserfromhell/nodes/extras/attribute.py

@@ -24,6 +24,7 @@ from ...utils import parse_anything

 __all__ = ["Attribute"]

+
 class Attribute(StringMixIn):
     """Represents an attribute of an HTML tag.


@@ -32,8 +33,15 @@ class Attribute(StringMixIn):
     whose value is ``"foo"``.
     """

-    def __init__(self, name, value=None, quotes='"', pad_first=" ",
-                 pad_before_eq="", pad_after_eq=""):
+    def __init__(
+        self,
+        name,
+        value=None,
+        quotes='"',
+        pad_first=" ",
+        pad_before_eq="",
+        pad_after_eq="",
+    ):
         super().__init__()
         self.name = name
         self._quotes = None


+2 -2 src/mwparserfromhell/nodes/extras/parameter.py

@@ -25,6 +25,7 @@ from ...utils import parse_anything

 __all__ = ["Parameter"]

+
 class Parameter(StringMixIn):
     """Represents a paramater of a template.


@@ -77,6 +78,5 @@ class Parameter(StringMixIn):
     def showkey(self, newval):
         newval = bool(newval)
         if not newval and not self.can_hide_key(self.name):
-            raise ValueError("parameter key {!r} cannot be hidden".format(
-                self.name))
+            raise ValueError("parameter key {!r} cannot be hidden".format(self.name))
         self._showkey = newval

+1 -0 src/mwparserfromhell/nodes/heading.py

@@ -24,6 +24,7 @@ from ..utils import parse_anything

 __all__ = ["Heading"]

+
 class Heading(Node):
     """Represents a section heading in wikicode, like ``== Foo ==``."""




+12 -5 src/mwparserfromhell/nodes/html_entity.py

@@ -24,6 +24,7 @@ from ._base import Node

 __all__ = ["HTMLEntity"]

+
 class HTMLEntity(Node):
     """Represents an HTML entity, like ``&nbsp;``, either named or unnamed."""


@@ -101,19 +102,23 @@ class HTMLEntity(Node):
             except ValueError:
                 if newval not in htmlentities.entitydefs:
                     raise ValueError(
-                        "entity value {!r} is not a valid name".format(newval)) from None
+                        "entity value {!r} is not a valid name".format(newval)
+                    ) from None
                 self._named = True
                 self._hexadecimal = False
             else:
                 if intval < 0 or intval > 0x10FFFF:
                     raise ValueError(
-                        "entity value 0x{:x} is not in range(0x110000)".format(intval)) from None
+                        "entity value 0x{:x} is not in range(0x110000)".format(intval)
+                    ) from None
                 self._named = False
                 self._hexadecimal = True
         else:
             test = int(newval, 16 if self.hexadecimal else 10)
             if test < 0 or test > 0x10FFFF:
-                raise ValueError("entity value {} is not in range(0x110000)".format(test))
+                raise ValueError(
+                    "entity value {} is not in range(0x110000)".format(test)
+                )
             self._named = False
         self._value = newval


@@ -126,8 +131,10 @@ class HTMLEntity(Node):
         try:
             int(self.value, 16)
         except ValueError as exc:
-            raise ValueError("current entity value {!r} is not a valid "
-                             "Unicode codepoint".format(self.value)) from exc
+            raise ValueError(
+                "current entity value {!r} is not a valid "
+                "Unicode codepoint".format(self.value)
+            ) from exc
         self._named = newval

     @hexadecimal.setter


+32 -8 src/mwparserfromhell/nodes/tag.py

@@ -26,13 +26,24 @@ from ..utils import parse_anything

 __all__ = ["Tag"]

+
 class Tag(Node):
     """Represents an HTML-style tag in wikicode, like ``<ref>``."""

-    def __init__(self, tag, contents=None, attrs=None, wiki_markup=None,
-                 self_closing=False, invalid=False, implicit=False, padding="",
-                 closing_tag=None, wiki_style_separator=None,
-                 closing_wiki_markup=None):
+    def __init__(
+        self,
+        tag,
+        contents=None,
+        attrs=None,
+        wiki_markup=None,
+        self_closing=False,
+        invalid=False,
+        implicit=False,
+        padding="",
+        closing_tag=None,
+        wiki_style_separator=None,
+        closing_wiki_markup=None,
+    ):
         super().__init__()
         self.tag = tag
         self.contents = contents
@@ -60,8 +71,14 @@ class Tag(Node):
             if self.self_closing:
                 return self.wiki_markup + attrs + padding + separator
             close = self.closing_wiki_markup or ""
-            return self.wiki_markup + attrs + padding + separator + \
-                str(self.contents) + close
+            return (
+                self.wiki_markup
+                + attrs
+                + padding
+                + separator
+                + str(self.contents)
+                + close
+            )

         result = ("</" if self.invalid else "<") + str(self.tag)
         if self.attributes:
@@ -270,8 +287,15 @@ class Tag(Node):
                 return attr
         raise ValueError(name)

-    def add(self, name, value=None, quotes='"', pad_first=" ",
-            pad_before_eq="", pad_after_eq=""):
+    def add(
+        self,
+        name,
+        value=None,
+        quotes='"',
+        pad_first=" ",
+        pad_before_eq="",
+        pad_after_eq="",
+    ):
         """Add an attribute with the given *name* and *value*.

         *name* and *value* can be anything parsable by


+7 -6 src/mwparserfromhell/nodes/template.py

@@ -33,6 +33,7 @@ FLAGS = re.DOTALL | re.UNICODE
 # Used to allow None as a valid fallback value
 _UNSET = object()

+
 class Template(Node):
     """Represents a template in wikicode, like ``{{foo}}``."""


@@ -153,7 +154,7 @@ class Template(Node):
     def _fix_dependendent_params(self, i):
         """Unhide keys if necessary after removing the param at index *i*."""
         if not self.params[i].showkey:
-            for param in self.params[i + 1:]:
+            for param in self.params[i + 1 :]:
                 if not param.showkey:
                     param.showkey = True


@@ -175,9 +176,10 @@ class Template(Node):
         If one exists, we should remove the given one rather than blanking it.
         """
         if self.params[i].showkey:
-            following = self.params[i + 1:]
-            better_matches = [after.name.strip() == name and not after.showkey
-                              for after in following]
+            following = self.params[i + 1 :]
+            better_matches = [
+                after.name.strip() == name and not after.showkey for after in following
+            ]
             return any(better_matches)
         return False


@@ -235,8 +237,7 @@ class Template(Node):
     def __getitem__(self, name):
         return self.get(name)

-    def add(self, name, value, showkey=None, before=None,
-            preserve_spacing=True):
+    def add(self, name, value, showkey=None, before=None, preserve_spacing=True):
         """Add a parameter to the template with a given *name* and *value*.

         *name* and *value* can be anything parsable by


+1 -0 src/mwparserfromhell/nodes/text.py

@@ -23,6 +23,7 @@ from ._base import Node

 __all__ = ["Text"]

+
 class Text(Node):
     """Represents ordinary, unformatted text with no special properties."""




+1 -0 src/mwparserfromhell/nodes/wikilink.py

@@ -24,6 +24,7 @@ from ..utils import parse_anything

 __all__ = ["Wikilink"]

+
 class Wikilink(Node):
     """Represents an internal wikilink, like ``[[Foo|Bar]]``."""




+5 -0 src/mwparserfromhell/parser/__init__.py

@@ -26,16 +26,20 @@ together into one interface.

 from .builder import Builder
 from .errors import ParserError
+
 try:
     from ._tokenizer import CTokenizer
+
     use_c = True
 except ImportError:
     from .tokenizer import Tokenizer
+
     CTokenizer = None
     use_c = False

 __all__ = ["use_c", "Parser", "ParserError"]

+
 class Parser:
     """Represents a parser for wikicode.


@@ -57,6 +61,7 @@ class Parser:
             self._tokenizer = CTokenizer()
         else:
             from .tokenizer import Tokenizer
+
             self._tokenizer = Tokenizer()
         self._builder = Builder()




+57 -20 src/mwparserfromhell/parser/builder.py

@@ -21,24 +21,34 @@

 from . import tokens
 from .errors import ParserError
-from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
-                     Template, Text, Wikilink)
+from ..nodes import (
+    Argument,
+    Comment,
+    ExternalLink,
+    Heading,
+    HTMLEntity,
+    Tag,
+    Template,
+    Text,
+    Wikilink,
+)
 from ..nodes.extras import Attribute, Parameter
 from ..smart_list import SmartList
 from ..wikicode import Wikicode

 __all__ = ["Builder"]

-_HANDLERS = {
-    tokens.Text: lambda self, token: Text(token.text)
-}
+_HANDLERS = {tokens.Text: lambda self, token: Text(token.text)}


 def _add_handler(token_type):
     """Create a decorator that adds a handler function to the lookup table."""
+
     def decorator(func):
         """Add a handler function to the lookup table."""
         _HANDLERS[token_type] = func
         return func
+
     return decorator




@@ -84,8 +94,9 @@ class Builder:
                 key = self._pop()
                 showkey = True
                 self._push()
-            elif isinstance(token, (tokens.TemplateParamSeparator,
-                                    tokens.TemplateClose)):
+            elif isinstance(
+                token, (tokens.TemplateParamSeparator, tokens.TemplateClose)
+            ):
                 self._tokens.append(token)
                 value = self._pop()
                 if key is None:
@@ -167,10 +178,17 @@ class Builder:
                 self._push()
             elif isinstance(token, tokens.ExternalLinkClose):
                 if url is not None:
-                    return ExternalLink(url, self._pop(), brackets=brackets,
-                                        suppress_space=suppress_space is True)
-                return ExternalLink(self._pop(), brackets=brackets,
-                                    suppress_space=suppress_space is True)
+                    return ExternalLink(
+                        url,
+                        self._pop(),
+                        brackets=brackets,
+                        suppress_space=suppress_space is True,
+                    )
+                return ExternalLink(
+                    self._pop(),
+                    brackets=brackets,
+                    suppress_space=suppress_space is True,
+                )
             else:
                 self._write(self._handle_token(token))
         raise ParserError("_handle_external_link() missed a close token")
@@ -184,8 +202,9 @@ class Builder:
             if isinstance(token, tokens.HTMLEntityHex):
                 text = self._tokens.pop()
                 self._tokens.pop()  # Remove HTMLEntityEnd
-                return HTMLEntity(text.text, named=False, hexadecimal=True,
-                                  hex_char=token.char)
+                return HTMLEntity(
+                    text.text, named=False, hexadecimal=True, hex_char=token.char
+                )
             self._tokens.pop()  # Remove HTMLEntityEnd
             return HTMLEntity(token.text, named=False, hexadecimal=False)
         self._tokens.pop()  # Remove HTMLEntityEnd
@@ -227,15 +246,23 @@ class Builder:
                 self._push()
             elif isinstance(token, tokens.TagAttrQuote):
                 quotes = token.char
-            elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen,
-                                    tokens.TagCloseSelfclose)):
+            elif isinstance(
+                token,
+                (tokens.TagAttrStart, tokens.TagCloseOpen, tokens.TagCloseSelfclose),
+            ):
                 self._tokens.append(token)
                 if name:
                     value = self._pop()
                 else:
                     name, value = self._pop(), None
-                return Attribute(name, value, quotes, start.pad_first,
-                                 start.pad_before_eq, start.pad_after_eq)
+                return Attribute(
+                    name,
+                    value,
+                    quotes,
+                    start.pad_first,
+                    start.pad_before_eq,
+                    start.pad_after_eq,
+                )
             else:
                 self._write(self._handle_token(token))
         raise ParserError("_handle_attribute() missed a close token")
@@ -271,9 +298,19 @@ class Builder:
             else:
                 self_closing = False
                 closing_tag = self._pop()
-                return Tag(tag, contents, attrs, wiki_markup, self_closing,
-                           invalid, implicit, padding, closing_tag,
-                           wiki_style_separator, closing_wiki_markup)
+                return Tag(
+                    tag,
+                    contents,
+                    attrs,
+                    wiki_markup,
+                    self_closing,
+                    invalid,
+                    implicit,
+                    padding,
+                    closing_tag,
+                    wiki_style_separator,
+                    closing_wiki_markup,
+                )
             else:
                 self._write(self._handle_token(token))
         raise ParserError("_handle_tag() missed a close token")


+59 -34 src/mwparserfromhell/parser/contexts.py

@@ -116,21 +116,21 @@ Aggregate contexts:

 # Local contexts:

-TEMPLATE_NAME =        1 << 0
-TEMPLATE_PARAM_KEY =   1 << 1
+TEMPLATE_NAME = 1 << 0
+TEMPLATE_PARAM_KEY = 1 << 1
 TEMPLATE_PARAM_VALUE = 1 << 2
 TEMPLATE = TEMPLATE_NAME + TEMPLATE_PARAM_KEY + TEMPLATE_PARAM_VALUE

-ARGUMENT_NAME =    1 << 3
+ARGUMENT_NAME = 1 << 3
 ARGUMENT_DEFAULT = 1 << 4
 ARGUMENT = ARGUMENT_NAME + ARGUMENT_DEFAULT

 WIKILINK_TITLE = 1 << 5
-WIKILINK_TEXT =  1 << 6
+WIKILINK_TEXT = 1 << 6
 WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT

-EXT_LINK_URI =   1 << 7
-EXT_LINK_TITLE = 1 << 8
+EXT_LINK_URI = 1 << 7
+EXT_LINK_TITLE = 1 << 8
 EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE

 HEADING_LEVEL_1 = 1 << 9
@@ -139,42 +139,61 @@ HEADING_LEVEL_3 = 1 << 11
 HEADING_LEVEL_4 = 1 << 12
 HEADING_LEVEL_5 = 1 << 13
 HEADING_LEVEL_6 = 1 << 14
-HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 +
-           HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6)
-
-TAG_OPEN =  1 << 15
-TAG_ATTR =  1 << 16
-TAG_BODY =  1 << 17
+HEADING = (
+    HEADING_LEVEL_1
+    + HEADING_LEVEL_2
+    + HEADING_LEVEL_3
+    + HEADING_LEVEL_4
+    + HEADING_LEVEL_5
+    + HEADING_LEVEL_6
+)
+
+TAG_OPEN = 1 << 15
+TAG_ATTR = 1 << 16
+TAG_BODY = 1 << 17
 TAG_CLOSE = 1 << 18
 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE

-STYLE_ITALICS =     1 << 19
-STYLE_BOLD =        1 << 20
-STYLE_PASS_AGAIN =  1 << 21
-STYLE_SECOND_PASS = 1 << 22
+STYLE_ITALICS = 1 << 19
+STYLE_BOLD = 1 << 20
+STYLE_PASS_AGAIN = 1 << 21
+STYLE_SECOND_PASS = 1 << 22
 STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS

 DL_TERM = 1 << 23

-HAS_TEXT =     1 << 24
-FAIL_ON_TEXT = 1 << 25
-FAIL_NEXT =    1 << 26
+HAS_TEXT = 1 << 24
+FAIL_ON_TEXT = 1 << 25
+FAIL_NEXT = 1 << 26
 FAIL_ON_LBRACE = 1 << 27
 FAIL_ON_RBRACE = 1 << 28
 FAIL_ON_EQUALS = 1 << 29
-HAS_TEMPLATE =   1 << 30
-SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
-                FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE)
-
-TABLE_OPEN =       1 << 31
-TABLE_CELL_OPEN =  1 << 32
+HAS_TEMPLATE = 1 << 30
+SAFETY_CHECK = (
+    HAS_TEXT
+    + FAIL_ON_TEXT
+    + FAIL_NEXT
+    + FAIL_ON_LBRACE
+    + FAIL_ON_RBRACE
+    + FAIL_ON_EQUALS
+    + HAS_TEMPLATE
+)
+
+TABLE_OPEN = 1 << 31
+TABLE_CELL_OPEN = 1 << 32
 TABLE_CELL_STYLE = 1 << 33
-TABLE_ROW_OPEN =   1 << 34
-TABLE_TD_LINE =    1 << 35
-TABLE_TH_LINE =    1 << 36
+TABLE_ROW_OPEN = 1 << 34
+TABLE_TD_LINE = 1 << 35
+TABLE_TH_LINE = 1 << 36
 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
-TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
-         TABLE_TD_LINE + TABLE_TH_LINE)
+TABLE = (
+    TABLE_OPEN
+    + TABLE_CELL_OPEN
+    + TABLE_CELL_STYLE
+    + TABLE_ROW_OPEN
+    + TABLE_TD_LINE
+    + TABLE_TH_LINE
+)

 HTML_ENTITY = 1 << 37


@@ -184,14 +203,20 @@ GL_HEADING = 1 << 0

 # Aggregate contexts:

-FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG +
-        STYLE + TABLE)
-UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
-          TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
+FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE
+UNSAFE = (
+    TEMPLATE_NAME
+    + WIKILINK_TITLE
+    + EXT_LINK_TITLE
+    + TEMPLATE_PARAM_KEY
+    + ARGUMENT_NAME
+    + TAG_CLOSE
+)
 DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN
 NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI
 NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK


 def describe(context):
     """Return a string describing the given context value, for debugging."""
     flags = []
flags = [] flags = []


+525 -510 src/mwparserfromhell/parser/ctokenizer/avl_tree.c (diff suppressed because it is too large)


+166 -177 src/mwparserfromhell/parser/ctokenizer/avl_tree.h

@@ -1,6 +1,6 @@
 /*
  * avl_tree.h - intrusive, nonrecursive AVL tree data structure (self-balancing
- *              binary search tree), header file
+ * binary search tree), header file
  *
  * Written in 2014-2016 by Eric Biggers <ebiggers3@gmail.com>
  * Slight changes for compatibility by Ben Kurtovic <ben.kurtovic@gmail.com>
@@ -24,60 +24,60 @@
 #include <stddef.h>

 #if !defined(_MSC_VER) || (_MSC_VER >= 1600)
-#include <stdint.h>
+#    include <stdint.h>
 #endif

 #ifdef __GNUC__
-#  define AVL_INLINE inline __attribute__((always_inline))
+#    define AVL_INLINE inline __attribute__((always_inline))
 #elif defined(_MSC_VER) && (_MSC_VER < 1900)
-#  define AVL_INLINE __inline
+#    define AVL_INLINE __inline
 #else
-#  define AVL_INLINE inline
+#    define AVL_INLINE inline
 #endif

 /* Node in an AVL tree. Embed this in some other data structure. */
 struct avl_tree_node {

-        /* Pointer to left child or NULL */
-        struct avl_tree_node *left;
+    /* Pointer to left child or NULL */
+    struct avl_tree_node *left;

-        /* Pointer to right child or NULL */
-        struct avl_tree_node *right;
+    /* Pointer to right child or NULL */
+    struct avl_tree_node *right;

-        /* Pointer to parent combined with the balance factor. This saves 4 or
-         * 8 bytes of memory depending on the CPU architecture.
-         *
-         * Low 2 bits: One greater than the balance factor of this subtree,
-         * which is equal to height(right) - height(left). The mapping is:
-         *
-         * 00 => -1
-         * 01 => 0
-         * 10 => +1
-         * 11 => undefined
-         *
-         * The rest of the bits are the pointer to the parent node. It must be
-         * 4-byte aligned, and it will be NULL if this is the root node and
-         * therefore has no parent. */
-        uintptr_t parent_balance;
+    /* Pointer to parent combined with the balance factor. This saves 4 or
+     * 8 bytes of memory depending on the CPU architecture.
+     *
+     * Low 2 bits: One greater than the balance factor of this subtree,
+     * which is equal to height(right) - height(left). The mapping is:
+     *
+     * 00 => -1
+     * 01 => 0
+     * 10 => +1
+     * 11 => undefined
+     *
+     * The rest of the bits are the pointer to the parent node. It must be
+     * 4-byte aligned, and it will be NULL if this is the root node and
+     * therefore has no parent. */
+    uintptr_t parent_balance;
 };

 /* Cast an AVL tree node to the containing data structure. */
-#define avl_tree_entry(entry, type, member) \
-        ((type*) ((char *)(entry) - offsetof(type, member)))
+#define avl_tree_entry(entry, type, member) \
+    ((type *) ((char *) (entry) -offsetof(type, member)))

 /* Returns a pointer to the parent of the specified AVL tree node, or NULL if it
  * is already the root of the tree. */
 static AVL_INLINE struct avl_tree_node *
 avl_get_parent(const struct avl_tree_node *node)
 {
-        return (struct avl_tree_node *)(node->parent_balance & ~3);
+    return (struct avl_tree_node *) (node->parent_balance & ~3);
 }

 /* Marks the specified AVL tree node as unlinked from any tree. */
 static AVL_INLINE void
 avl_tree_node_set_unlinked(struct avl_tree_node *node)
 {
-        node->parent_balance = (uintptr_t)node;
+    node->parent_balance = (uintptr_t) node;
 }


/* Returns true iff the specified AVL tree node has been marked with /* Returns true iff the specified AVL tree node has been marked with
@@ -86,30 +86,29 @@ avl_tree_node_set_unlinked(struct avl_tree_node *node)
 static AVL_INLINE int
 avl_tree_node_is_unlinked(const struct avl_tree_node *node)
 {
-        return node->parent_balance == (uintptr_t)node;
+    return node->parent_balance == (uintptr_t) node;
 }

 /* (Internal use only) */
-extern void
-avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr,
-                                struct avl_tree_node *inserted);
+extern void avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr,
+                                            struct avl_tree_node *inserted);

 /*
  * Looks up an item in the specified AVL tree.
  *
  * @root
- *      Pointer to the root of the AVL tree. (This can be NULL --- that just
- *      means the tree is empty.)
+ *     Pointer to the root of the AVL tree. (This can be NULL --- that just
+ *     means the tree is empty.)
  *
  * @cmp_ctx
- *      First argument to pass to the comparison callback. This generally
- *      should be a pointer to an object equal to the one being searched for.
+ *     First argument to pass to the comparison callback. This generally
+ *     should be a pointer to an object equal to the one being searched for.
  *
  * @cmp
- *      Comparison callback. Must return < 0, 0, or > 0 if the first argument
- *      is less than, equal to, or greater than the second argument,
- *      respectively. The first argument will be @cmp_ctx and the second
- *      argument will be a pointer to the AVL tree node of an item in the tree.
+ *     Comparison callback. Must return < 0, 0, or > 0 if the first argument
+ *     is less than, equal to, or greater than the second argument,
+ *     respectively. The first argument will be @cmp_ctx and the second
+ *     argument will be a pointer to the AVL tree node of an item in the tree.
  *
  * Returns a pointer to the AVL tree node of the resulting item, or NULL if the
  * item was not found.
@@ -117,48 +116,49 @@ avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr,
  * Example:
  *
  * struct int_wrapper {
- *      int data;
- *      struct avl_tree_node index_node;
+ *     int data;
+ *     struct avl_tree_node index_node;
  * };
  *
  * static int _avl_cmp_int_to_node(const void *intptr,
- *                                 const struct avl_tree_node *nodeptr)
+ *                                 const struct avl_tree_node *nodeptr)
  * {
- *      int n1 = *(const int *)intptr;
- *      int n2 = avl_tree_entry(nodeptr, struct int_wrapper, index_node)->data;
- *      if (n1 < n2)
- *              return -1;
- *      else if (n1 > n2)
- *              return 1;
- *      else
- *              return 0;
+ *     int n1 = *(const int *)intptr;
+ *     int n2 = avl_tree_entry(nodeptr, struct int_wrapper, index_node)->data;
+ *     if (n1 < n2)
+ *         return -1;
+ *     else if (n1 > n2)
+ *         return 1;
+ *     else
+ *         return 0;
  * }
  *
  * bool contains_int(struct avl_tree_node *root, int n)
  * {
- *      struct avl_tree_node *result;
+ *     struct avl_tree_node *result;
  *
- *      result = avl_tree_lookup(root, &n, _avl_cmp_int_to_node);
- *      return result ? true : false;
+ *     result = avl_tree_lookup(root, &n, _avl_cmp_int_to_node);
+ *     return result ? true : false;
  * }
  */
 static AVL_INLINE struct avl_tree_node *
 avl_tree_lookup(const struct avl_tree_node *root,
-                const void *cmp_ctx,
-                int (*cmp)(const void *, const struct avl_tree_node *))
+                const void *cmp_ctx,
+                int (*cmp)(const void *, const struct avl_tree_node *))
 {
-        const struct avl_tree_node *cur = root;
+    const struct avl_tree_node *cur = root;

-        while (cur) {
-                int res = (*cmp)(cmp_ctx, cur);
-                if (res < 0)
-                        cur = cur->left;
-                else if (res > 0)
-                        cur = cur->right;
-                else
-                        break;
-        }
-        return (struct avl_tree_node*)cur;
+    while (cur) {
+        int res = (*cmp)(cmp_ctx, cur);
+        if (res < 0) {
+            cur = cur->left;
+        } else if (res > 0) {
+            cur = cur->right;
+        } else {
+            break;
+        }
+    }
+    return (struct avl_tree_node *) cur;
 }

 /* Same as avl_tree_lookup(), but uses a more specific type for the comparison
@@ -167,44 +167,45 @@ avl_tree_lookup(const struct avl_tree_node *root,
  * embedded 'struct avl_tree_node'. */
 static AVL_INLINE struct avl_tree_node *
 avl_tree_lookup_node(const struct avl_tree_node *root,
-                     const struct avl_tree_node *node,
-                     int (*cmp)(const struct avl_tree_node *,
-                                const struct avl_tree_node *))
+                     const struct avl_tree_node *node,
+                     int (*cmp)(const struct avl_tree_node *,
+                                const struct avl_tree_node *))
 {
-        const struct avl_tree_node *cur = root;
+    const struct avl_tree_node *cur = root;

-        while (cur) {
-                int res = (*cmp)(node, cur);
-                if (res < 0)
-                        cur = cur->left;
-                else if (res > 0)
-                        cur = cur->right;
-                else
-                        break;
-        }
-        return (struct avl_tree_node*)cur;
+    while (cur) {
+        int res = (*cmp)(node, cur);
+        if (res < 0) {
+            cur = cur->left;
+        } else if (res > 0) {
+            cur = cur->right;
+        } else {
+            break;
+        }
+    }
+    return (struct avl_tree_node *) cur;
 }

 /*
  * Inserts an item into the specified AVL tree.
  *
  * @root_ptr
- *      Location of the AVL tree's root pointer. Indirection is needed because
- *      the root node may change as a result of rotations caused by the
- *      insertion. Initialize *root_ptr to NULL for an empty tree.
+ *     Location of the AVL tree's root pointer. Indirection is needed because
+ *     the root node may change as a result of rotations caused by the
+ *     insertion. Initialize *root_ptr to NULL for an empty tree.
  *
  * @item
- *      Pointer to the `struct avl_tree_node' embedded in the item to insert.
- *      No members in it need be pre-initialized, although members in the
- *      containing structure should be pre-initialized so that @cmp can use them
- *      in comparisons.
+ *     Pointer to the `struct avl_tree_node' embedded in the item to insert.
+ *     No members in it need be pre-initialized, although members in the
+ *     containing structure should be pre-initialized so that @cmp can use them
+ *     in comparisons.
  *
  * @cmp
- *      Comparison callback. Must return < 0, 0, or > 0 if the first argument
- *      is less than, equal to, or greater than the second argument,
- *      respectively. The first argument will be @item and the second
- *      argument will be a pointer to an AVL tree node embedded in some
- *      previously-inserted item to which @item is being compared.
+ *     Comparison callback. Must return < 0, 0, or > 0 if the first argument
+ *     is less than, equal to, or greater than the second argument,
+ *     respectively. The first argument will be @item and the second
+ *     argument will be a pointer to an AVL tree node embedded in some
+ *     previously-inserted item to which @item is being compared.
  *
  * If no item in the tree is comparatively equal (via @cmp) to @item, inserts
 * @item and returns NULL. Otherwise does nothing and returns a pointer to the
@@ -214,150 +215,138 @@ avl_tree_lookup_node(const struct avl_tree_node *root,
  * Example:
  *
  * struct int_wrapper {
- *      int data;
- *      struct avl_tree_node index_node;
+ *     int data;
+ *     struct avl_tree_node index_node;
  * };
  *
  * #define GET_DATA(i) avl_tree_entry((i), struct int_wrapper, index_node)->data
  *
  * static int _avl_cmp_ints(const struct avl_tree_node *node1,
- *                          const struct avl_tree_node *node2)
+ *                          const struct avl_tree_node *node2)
  * {
- *      int n1 = GET_DATA(node1);
- *      int n2 = GET_DATA(node2);
- *      if (n1 < n2)
- *              return -1;
- *      else if (n1 > n2)
- *              return 1;
- *      else
- *              return 0;
+ *     int n1 = GET_DATA(node1);
+ *     int n2 = GET_DATA(node2);
+ *     if (n1 < n2)
+ *         return -1;
+ *     else if (n1 > n2)
+ *         return 1;
+ *     else
+ *         return 0;
  * }
  *
  * bool insert_int(struct avl_tree_node **root_ptr, int data)
 * {
- *      struct int_wrapper *i = malloc(sizeof(struct int_wrapper));
- *      i->data = data;
- *      if (avl_tree_insert(root_ptr, &i->index_node, _avl_cmp_ints)) {
- *              // Duplicate.
- *              free(i);
- *              return false;
- *      }
- *      return true;
+ *     struct int_wrapper *i = malloc(sizeof(struct int_wrapper));
+ *     i->data = data;
+ *     if (avl_tree_insert(root_ptr, &i->index_node, _avl_cmp_ints)) {
+ *         // Duplicate.
+ *         free(i);
+ *         return false;
+ *     }
+ *     return true;
  * }
  */
 static AVL_INLINE struct avl_tree_node *
 avl_tree_insert(struct avl_tree_node **root_ptr,
-                struct avl_tree_node *item,
-                int (*cmp)(const struct avl_tree_node *,
-                           const struct avl_tree_node *))
+                struct avl_tree_node *item,
+                int (*cmp)(const struct avl_tree_node *, const struct avl_tree_node *))
 {
-        struct avl_tree_node **cur_ptr = root_ptr, *cur = NULL;
-        int res;
+    struct avl_tree_node **cur_ptr = root_ptr, *cur = NULL;
+    int res;

-        while (*cur_ptr) {
-                cur = *cur_ptr;
-                res = (*cmp)(item, cur);
-                if (res < 0)
-                        cur_ptr = &cur->left;
-                else if (res > 0)
-                        cur_ptr = &cur->right;
-                else
-                        return cur;
-        }
-        *cur_ptr = item;
-        item->parent_balance = (uintptr_t)cur | 1;
-        avl_tree_rebalance_after_insert(root_ptr, item);
-        return NULL;
+    while (*cur_ptr) {
+        cur = *cur_ptr;
+        res = (*cmp)(item, cur);
+        if (res < 0) {
+            cur_ptr = &cur->left;
+        } else if (res > 0) {
+            cur_ptr = &cur->right;
+        } else {
+            return cur;
+        }
+    }
+    *cur_ptr = item;
+    item->parent_balance = (uintptr_t) cur | 1;
+    avl_tree_rebalance_after_insert(root_ptr, item);
+    return NULL;
 }

 /* Removes an item from the specified AVL tree.
  * See implementation for details. */
-extern void
-avl_tree_remove(struct avl_tree_node **root_ptr, struct avl_tree_node *node);
+extern void avl_tree_remove(struct avl_tree_node **root_ptr,
+                            struct avl_tree_node *node);

 /* Nonrecursive AVL tree traversal functions */

-extern struct avl_tree_node *
-avl_tree_first_in_order(const struct avl_tree_node *root);
+extern struct avl_tree_node *avl_tree_first_in_order(const struct avl_tree_node *root);

-extern struct avl_tree_node *
-avl_tree_last_in_order(const struct avl_tree_node *root);
+extern struct avl_tree_node *avl_tree_last_in_order(const struct avl_tree_node *root);

-extern struct avl_tree_node *
-avl_tree_next_in_order(const struct avl_tree_node *node);
+extern struct avl_tree_node *avl_tree_next_in_order(const struct avl_tree_node *node);

-extern struct avl_tree_node *
-avl_tree_prev_in_order(const struct avl_tree_node *node);
+extern struct avl_tree_node *avl_tree_prev_in_order(const struct avl_tree_node *node);

 extern struct avl_tree_node *
 avl_tree_first_in_postorder(const struct avl_tree_node *root);

 extern struct avl_tree_node *
 avl_tree_next_in_postorder(const struct avl_tree_node *prev,
-                           const struct avl_tree_node *prev_parent);
+                           const struct avl_tree_node *prev_parent);

 /*
  * Iterate through the nodes in an AVL tree in sorted order.
  * You may not modify the tree during the iteration.
  *
  * @child_struct
- *      Variable that will receive a pointer to each struct inserted into the
- *      tree.
+ *     Variable that will receive a pointer to each struct inserted into the
+ *     tree.
  * @root
- *      Root of the AVL tree.
+ *     Root of the AVL tree.
  * @struct_name
- *      Type of *child_struct.
+ *     Type of *child_struct.
 * @struct_member
- *      Member of @struct_name type that is the AVL tree node.
+ *     Member of @struct_name type that is the AVL tree node.
  *
  * Example:
  *
  * struct int_wrapper {
- *      int data;
- *      struct avl_tree_node index_node;
+ *     int data;
+ *     struct avl_tree_node index_node;
  * };
 *
 * void print_ints(struct avl_tree_node *root)
 * {
- *      struct int_wrapper *i;
+ *     struct int_wrapper *i;
 *
- *      avl_tree_for_each_in_order(i, root, struct int_wrapper, index_node)
- *              printf("%d\n", i->data);
+ *     avl_tree_for_each_in_order(i, root, struct int_wrapper, index_node)
+ *         printf("%d\n", i->data);
 * }
 */
-#define avl_tree_for_each_in_order(child_struct, root,                  \
-                                   struct_name, struct_member)          \
-        for (struct avl_tree_node *_cur =                               \
-                avl_tree_first_in_order(root);                          \
-             _cur && ((child_struct) =                                  \
-                        avl_tree_entry(_cur, struct_name,               \
-                                       struct_member), 1);              \
-             _cur = avl_tree_next_in_order(_cur))
+#define avl_tree_for_each_in_order(child_struct, root, struct_name, struct_member)     \
+    for (struct avl_tree_node *_cur = avl_tree_first_in_order(root);                   \
+         _cur &&                                                                       \
+         ((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1);       \
+         _cur = avl_tree_next_in_order(_cur))

 /*
  * Like avl_tree_for_each_in_order(), but uses the reverse order.
  */
-#define avl_tree_for_each_in_reverse_order(child_struct, root,          \
-                                           struct_name, struct_member)  \
-        for (struct avl_tree_node *_cur =                               \
-                avl_tree_last_in_order(root);                           \
-             _cur && ((child_struct) =                                  \
-                        avl_tree_entry(_cur, struct_name,               \
-                                       struct_member), 1);              \
-             _cur = avl_tree_prev_in_order(_cur))
+#define avl_tree_for_each_in_reverse_order(                                            \
+    child_struct, root, struct_name, struct_member)                                    \
+    for (struct avl_tree_node *_cur = avl_tree_last_in_order(root);                    \
+         _cur &&                                                                       \
+         ((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1);       \
+         _cur = avl_tree_prev_in_order(_cur))

 /*
  * Like avl_tree_for_each_in_order(), but iterates through the nodes in
  * postorder, so the current node may be deleted or freed.
  */
-#define avl_tree_for_each_in_postorder(child_struct, root,              \
-                                       struct_name, struct_member)      \
-        for (struct avl_tree_node *_cur =                               \
-                avl_tree_first_in_postorder(root), *_parent;            \
-             _cur && ((child_struct) =                                  \
-                        avl_tree_entry(_cur, struct_name,               \
-                                       struct_member), 1)               \
-                && (_parent = avl_get_parent(_cur), 1);                 \
-             _cur = avl_tree_next_in_postorder(_cur, _parent))
+#define avl_tree_for_each_in_postorder(child_struct, root, struct_name, struct_member) \
+    for (struct avl_tree_node *_cur = avl_tree_first_in_postorder(root), *_parent;     \
+         _cur &&                                                                       \
+         ((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1) &&     \
+         (_parent = avl_get_parent(_cur), 1);                                          \
+         _cur = avl_tree_next_in_postorder(_cur, _parent))

 #endif /* _AVL_TREE_H_ */

+34 -33 src/mwparserfromhell/parser/ctokenizer/common.h

@@ -23,55 +23,56 @@ SOFTWARE.
 #pragma once

 #ifndef PY_SSIZE_T_CLEAN
-#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html
+#    define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html
 #endif

 #include <Python.h>
-#include <structmember.h>
 #include <bytesobject.h>
+#include <structmember.h>

 #include "avl_tree.h"

 /* Compatibility macros */

 #ifndef uint64_t
-#define uint64_t unsigned PY_LONG_LONG
+#    define uint64_t unsigned PY_LONG_LONG
 #endif

-#define malloc PyObject_Malloc // XXX: yuck
+#define malloc  PyObject_Malloc // XXX: yuck
 #define realloc PyObject_Realloc
 #define free PyObject_Free

 /* Unicode support macros */

-#define PyUnicode_FROM_SINGLE(chr) \
+#define PyUnicode_FROM_SINGLE(chr)                                                     \
     PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)

 /* Error handling macros */

-#define BAD_ROUTE self->route_state
-#define BAD_ROUTE_CONTEXT self->route_context
-#define FAIL_ROUTE(context) { \
-    self->route_state = 1; \
-    self->route_context = context; \
-}
-#define RESET_ROUTE() self->route_state = 0
+#define BAD_ROUTE         self->route_state
+#define BAD_ROUTE_CONTEXT self->route_context
+#define FAIL_ROUTE(context)                                                            \
+    do {                                                                               \
+        self->route_state = 1;                                                         \
+        self->route_context = context;                                                 \
+    } while (0)
+#define RESET_ROUTE() self->route_state = 0

 /* Shared globals */

-extern char** entitydefs;
+extern char **entitydefs;

-extern PyObject* NOARGS;
-extern PyObject* definitions;
+extern PyObject *NOARGS;
+extern PyObject *definitions;

 /* Structs */

 typedef struct {
     Py_ssize_t capacity;
     Py_ssize_t length;
-    PyObject* object;
+    PyObject *object;
     int kind;
-    void* data;
+    void *data;
 } Textbuffer;

 typedef struct {
@@ -80,19 +81,19 @@ typedef struct {
 } StackIdent;

 struct Stack {
-    PyObject* stack;
+    PyObject *stack;
     uint64_t context;
-    Textbuffer* textbuffer;
+    Textbuffer *textbuffer;
     StackIdent ident;
-    struct Stack* next;
+    struct Stack *next;
 };
 typedef struct Stack Stack;

 typedef struct {
-    PyObject* object;   /* base PyUnicodeObject object */
-    Py_ssize_t length;  /* length of object, in code points */
-    int kind;           /* object's kind value */
-    void* data;         /* object's raw unicode buffer */
+    PyObject *object;  /* base PyUnicodeObject object */
+    Py_ssize_t length; /* length of object, in code points */
+    int kind;          /* object's kind value */
+    void *data;        /* object's raw unicode buffer */
 } TokenizerInput;

 typedef struct avl_tree_node avl_tree;
@@ -104,13 +105,13 @@ typedef struct {


typedef struct { typedef struct {
PyObject_HEAD PyObject_HEAD
TokenizerInput text; /* text to tokenize */
Stack* topstack; /* topmost stack */
Py_ssize_t head; /* current position in text */
int global; /* global context */
int depth; /* stack recursion depth */
int route_state; /* whether a BadRoute has been triggered */
uint64_t route_context; /* context when the last BadRoute was triggered */
avl_tree* bad_routes; /* stack idents for routes known to fail */
int skip_style_tags; /* temp fix for the sometimes broken tag parser */
TokenizerInput text; /* text to tokenize */
Stack *topstack; /* topmost stack */
Py_ssize_t head; /* current position in text */
int global; /* global context */
int depth; /* stack recursion depth */
int route_state; /* whether a BadRoute has been triggered */
uint64_t route_context; /* context when the last BadRoute was triggered */
avl_tree *bad_routes; /* stack idents for routes known to fail */
int skip_style_tags; /* temp fix for the sometimes broken tag parser */
} Tokenizer; } Tokenizer;
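The FAIL_ROUTE() rewrite above is the standard do { ... } while (0) macro wrapper: unlike a bare brace block, it forms a single statement that absorbs the caller's trailing semicolon, so the macro stays safe inside if/else chains. A minimal sketch of the failure mode it prevents (illustrative fragment; assumes a tokenizer function where `self` and `context` are in scope):

    /* With the old brace-only macro, the `;` after FAIL_ROUTE(context)
       would close the `if` statement early and orphan the `else`,
       producing a compile error. The do-while form parses cleanly: */
    if (bad_input)
        FAIL_ROUTE(context);
    else
        RESET_ROUTE();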

+ 11
- 5
src/mwparserfromhell/parser/ctokenizer/contexts.h

@@ -89,11 +89,17 @@ SOFTWARE.

/* Aggregate contexts */

#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)
#define AGG_FAIL \
    (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | \
     LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE \
    (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | \
     LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS \
    (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS \
    (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)

/* Tag contexts */
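Each AGG_* macro above is just the bitwise OR of the listed single-bit LC_* contexts, so testing whether the current parse state falls into an aggregate class costs one AND. A usage sketch (the helper name is illustrative, not from this diff):

#include <stdint.h>
#include "contexts.h"

static int
can_parse_wikilink(uint64_t context)
{
    /* wikilinks are rejected inside template names, argument names,
       wikilink titles, and external-link URIs */
    return (context & AGG_NO_WIKILINKS) == 0;
}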




+ 31
- 23
src/mwparserfromhell/parser/ctokenizer/definitions.c

@@ -27,7 +27,8 @@ SOFTWARE.
    See the Python version for data sources.
*/

static const char* URI_SCHEMES[] = {
// clang-format off
static const char *URI_SCHEMES[] = {
    "bitcoin",
    "ftp",
    "ftps",
@@ -55,10 +56,10 @@ static const char* URI_SCHEMES[] = {
    "urn",
    "worldwind",
    "xmpp",
    NULL,
};

static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
static const char *URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
    "bitcoin",
    "geo",
    "magnet",
@@ -73,7 +74,7 @@ static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
    NULL,
};

static const char* PARSER_BLACKLIST[] = {
static const char *PARSER_BLACKLIST[] = {
    "categorytree",
    "ce",
    "chem",
@@ -93,32 +94,32 @@ static const char* PARSER_BLACKLIST[] = {
    "timeline",
    NULL,
};
// clang-format on

static const char* SINGLE[] = {
    "br", "wbr", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td",
    "tr", NULL
};
static const char *SINGLE[] = {
    "br", "wbr", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr", NULL};

static const char* SINGLE_ONLY[] = {
    "br", "wbr", "hr", "meta", "link", "img", NULL
};
static const char *SINGLE_ONLY[] = {"br", "wbr", "hr", "meta", "link", "img", NULL};

/*
    Convert a PyUnicodeObject to a lowercase ASCII char* array and store it in
    the second argument. The caller must free the return value when finished.
    If the return value is NULL, the conversion failed and *string is not set.
*/
static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string)
static PyObject *
unicode_to_lcase_ascii(PyObject *input, const char **string)
{
    PyObject *lower = PyObject_CallMethod(input, "lower", NULL), *bytes;

    if (!lower)
    if (!lower) {
        return NULL;
    }
    bytes = PyUnicode_AsASCIIString(lower);
    Py_DECREF(lower);
    if (!bytes) {
        if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
        if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) {
            PyErr_Clear();
        }
        return NULL;
    }
    *string = PyBytes_AS_STRING(bytes);
@@ -128,14 +129,16 @@ static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string)
/*
    Return whether a PyUnicodeObject is in a list of lowercase ASCII strings.
*/
static int unicode_in_string_list(PyObject *input, const char **list)
static int
unicode_in_string_list(PyObject *input, const char **list)
{
    const char *string;
    PyObject *temp = unicode_to_lcase_ascii(input, &string);
    int retval = 0;

    if (!temp)
    if (!temp) {
        return 0;
    }

    while (*list) {
        if (!strcmp(*(list++), string)) {
@@ -144,7 +147,7 @@ static int unicode_in_string_list(PyObject *input, const char **list)
        }
    }

end:
    Py_DECREF(temp);
    return retval;
}
@@ -152,7 +155,8 @@ static int unicode_in_string_list(PyObject *input, const char **list)
/*
    Return if the given tag's contents should be passed to the parser.
*/
int is_parsable(PyObject *tag)
int
is_parsable(PyObject *tag)
{
    return !unicode_in_string_list(tag, PARSER_BLACKLIST);
}
@@ -160,7 +164,8 @@ int is_parsable(PyObject *tag)
/*
    Return whether or not the given tag can exist without a close tag.
*/
int is_single(PyObject *tag)
int
is_single(PyObject *tag)
{
    return unicode_in_string_list(tag, SINGLE);
}
@@ -168,7 +173,8 @@ int is_single(PyObject *tag)
/*
    Return whether or not the given tag must exist without a close tag.
*/
int is_single_only(PyObject *tag)
int
is_single_only(PyObject *tag)
{
    return unicode_in_string_list(tag, SINGLE_ONLY);
}
@@ -176,10 +182,12 @@ int is_single_only(PyObject *tag)
/*
    Return whether the given scheme is valid for external links.
*/
int is_scheme(PyObject *scheme, int slashes)
int
is_scheme(PyObject *scheme, int slashes)
{
    if (slashes)
    if (slashes) {
        return unicode_in_string_list(scheme, URI_SCHEMES);
    else
    } else {
        return unicode_in_string_list(scheme, URI_SCHEMES_AUTHORITY_OPTIONAL);
    }
}
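All of the lookup tables above are NULL-terminated arrays of lowercase strings, which is what lets unicode_in_string_list() walk them with a plain while (*list) loop. A caller-side sketch for the scheme check (illustrative only; "https" sits in the elided middle of URI_SCHEMES):

PyObject *scheme = PyUnicode_FromString("https");
if (scheme) {
    /* slashes != 0 means the link was written with "//", so the scheme
       must come from URI_SCHEMES rather than the authority-optional list */
    int valid = is_scheme(scheme, 1);
    Py_DECREF(scheme);
}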

+ 5
- 6
src/mwparserfromhell/parser/ctokenizer/definitions.h

@@ -28,12 +28,11 @@ SOFTWARE.

/* Functions */

int is_parsable(PyObject*);
int is_single(PyObject*);
int is_single_only(PyObject*);
int is_scheme(PyObject*, int);
int is_parsable(PyObject *);
int is_single(PyObject *);
int is_single_only(PyObject *);
int is_scheme(PyObject *, int);

/* Macros */

#define GET_HTML_TAG(markup) \
    (markup == ':' ? "dd" : markup == ';' ? "dt" : "li")
#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li")
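GET_HTML_TAG() maps a wiki list-item marker character to its HTML tag name through the nested ternary: ':' yields "dd", ';' yields "dt", and any other marker falls through to "li". A worked sketch:

static void
demo_html_tags(void)
{
    const char *dd = GET_HTML_TAG(':'); /* "dd": definition description */
    const char *dt = GET_HTML_TAG(';'); /* "dt": definition term */
    const char *li = GET_HTML_TAG('*'); /* "li": ordinary list item */
    (void) dd; (void) dt; (void) li;
}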

+ 20
- 14
src/mwparserfromhell/parser/ctokenizer/tag_data.c

@@ -26,13 +26,14 @@ SOFTWARE.
/*
    Initialize a new TagData object.
*/
TagData* TagData_new(TokenizerInput* text)
TagData *
TagData_new(TokenizerInput *text)
{
#define ALLOC_BUFFER(name) \
    name = Textbuffer_new(text); \
    if (!name) { \
        TagData_dealloc(self); \
        return NULL; \
#define ALLOC_BUFFER(name)                                                     \
    name = Textbuffer_new(text);                                               \
    if (!name) {                                                               \
        TagData_dealloc(self);                                                 \
        return NULL;                                                           \
    }

    TagData *self = malloc(sizeof(TagData));
@@ -54,25 +55,30 @@ TagData* TagData_new(TokenizerInput* text)
/*
    Deallocate the given TagData object.
*/
void TagData_dealloc(TagData* self)
void
TagData_dealloc(TagData *self)
{
    if (self->pad_first)
    if (self->pad_first) {
        Textbuffer_dealloc(self->pad_first);
    if (self->pad_before_eq)
    }
    if (self->pad_before_eq) {
        Textbuffer_dealloc(self->pad_before_eq);
    if (self->pad_after_eq)
    }
    if (self->pad_after_eq) {
        Textbuffer_dealloc(self->pad_after_eq);
    }
    free(self);
}

/*
    Clear the internal buffers of the given TagData object.
*/
int TagData_reset_buffers(TagData* self)
int
TagData_reset_buffers(TagData *self)
{
    if (Textbuffer_reset(self->pad_first) ||
        Textbuffer_reset(self->pad_before_eq) ||
        Textbuffer_reset(self->pad_after_eq))
    if (Textbuffer_reset(self->pad_first) || Textbuffer_reset(self->pad_before_eq) ||
        Textbuffer_reset(self->pad_after_eq)) {
        return -1;
    }
    return 0;
}

+ 6
- 6
src/mwparserfromhell/parser/ctokenizer/tag_data.h

@@ -29,15 +29,15 @@ SOFTWARE.

typedef struct {
    uint64_t context;
    Textbuffer* pad_first;
    Textbuffer* pad_before_eq;
    Textbuffer* pad_after_eq;
    Textbuffer *pad_first;
    Textbuffer *pad_before_eq;
    Textbuffer *pad_after_eq;
    Py_UCS4 quoter;
    Py_ssize_t reset;
} TagData;

/* Functions */

TagData* TagData_new(TokenizerInput*);
void TagData_dealloc(TagData*);
int TagData_reset_buffers(TagData*);
TagData *TagData_new(TokenizerInput *);
void TagData_dealloc(TagData *);
int TagData_reset_buffers(TagData *);

+ 45
- 26
src/mwparserfromhell/parser/ctokenizer/textbuffer.c

@@ -23,20 +23,22 @@ SOFTWARE.
#include "textbuffer.h"

#define INITIAL_CAPACITY 32
#define RESIZE_FACTOR 2
#define CONCAT_EXTRA 32

/*
    Internal allocation function for textbuffers.
*/
static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar)
static int
internal_alloc(Textbuffer *self, Py_UCS4 maxchar)
{
    self->capacity = INITIAL_CAPACITY;
    self->length = 0;

    self->object = PyUnicode_New(self->capacity, maxchar);
    if (!self->object)
    if (!self->object) {
        return -1;
    }
    self->kind = PyUnicode_KIND(self->object);
    self->data = PyUnicode_DATA(self->object);

@@ -46,7 +48,8 @@ static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar)
/*
    Internal deallocation function for textbuffers.
*/
static void internal_dealloc(Textbuffer* self)
static void
internal_dealloc(Textbuffer *self)
{
    Py_DECREF(self->object);
}
@@ -54,14 +57,16 @@ static void internal_dealloc(Textbuffer* self)
/*
    Internal resize function.
*/
static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
static int
internal_resize(Textbuffer *self, Py_ssize_t new_cap)
{
    PyObject *newobj;
    void *newdata;

    newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object));
    if (!newobj)
    if (!newobj) {
        return -1;
    }
    newdata = PyUnicode_DATA(newobj);
    memcpy(newdata, self->data, self->length * self->kind);
    Py_DECREF(self->object);
@@ -75,22 +80,25 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
/*
    Create a new textbuffer object.
*/
Textbuffer* Textbuffer_new(TokenizerInput* text)
Textbuffer *
Textbuffer_new(TokenizerInput *text)
{
    Textbuffer* self = malloc(sizeof(Textbuffer));
    Textbuffer *self = malloc(sizeof(Textbuffer));
    Py_UCS4 maxchar = 0;

    maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);

    if (!self)
    if (!self) {
        goto fail_nomem;
    if (internal_alloc(self, maxchar) < 0)
    }
    if (internal_alloc(self, maxchar) < 0) {
        goto fail_dealloc;
    }
    return self;

fail_dealloc:
    free(self);
fail_nomem:
    PyErr_NoMemory();
    return NULL;
}
@@ -98,7 +106,8 @@ Textbuffer* Textbuffer_new(TokenizerInput* text)
/*
    Deallocate the given textbuffer.
*/
void Textbuffer_dealloc(Textbuffer* self)
void
Textbuffer_dealloc(Textbuffer *self)
{
    internal_dealloc(self);
    free(self);
@@ -107,26 +116,30 @@ void Textbuffer_dealloc(Textbuffer* self)
/*
    Reset a textbuffer to its initial, empty state.
*/
int Textbuffer_reset(Textbuffer* self)
int
Textbuffer_reset(Textbuffer *self)
{
    Py_UCS4 maxchar = 0;

    maxchar = PyUnicode_MAX_CHAR_VALUE(self->object);

    internal_dealloc(self);
    if (internal_alloc(self, maxchar))
    if (internal_alloc(self, maxchar)) {
        return -1;
    }
    return 0;
}

/*
    Write a Unicode codepoint to the given textbuffer.
*/
int Textbuffer_write(Textbuffer* self, Py_UCS4 code)
int
Textbuffer_write(Textbuffer *self, Py_UCS4 code)
{
    if (self->length >= self->capacity) {
        if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0)
        if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0) {
            return -1;
        }
    }

    PyUnicode_WRITE(self->kind, self->data, self->length++, code);
@@ -139,7 +152,8 @@ int Textbuffer_write(Textbuffer* self, Py_UCS4 code)

    This function does not check for bounds.
*/
Py_UCS4 Textbuffer_read(Textbuffer* self, Py_ssize_t index)
Py_UCS4
Textbuffer_read(Textbuffer *self, Py_ssize_t index)
{
    return PyUnicode_READ(self->kind, self->data, index);
}
@@ -147,7 +161,8 @@ Py_UCS4 Textbuffer_read(Textbuffer* self, Py_ssize_t index)
/*
    Return the contents of the textbuffer as a Python Unicode object.
*/
PyObject* Textbuffer_render(Textbuffer* self)
PyObject *
Textbuffer_render(Textbuffer *self)
{
    return PyUnicode_FromKindAndData(self->kind, self->data, self->length);
}
@@ -155,17 +170,20 @@ PyObject* Textbuffer_render(Textbuffer* self)
/*
    Concatenate the 'other' textbuffer onto the end of the given textbuffer.
*/
int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
int
Textbuffer_concat(Textbuffer *self, Textbuffer *other)
{
    Py_ssize_t newlen = self->length + other->length;

    if (newlen > self->capacity) {
        if (internal_resize(self, newlen + CONCAT_EXTRA) < 0)
        if (internal_resize(self, newlen + CONCAT_EXTRA) < 0) {
            return -1;
        }
    }

    assert(self->kind == other->kind);
    memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data,
    memcpy(((Py_UCS1 *) self->data) + self->kind * self->length,
           other->data,
           other->length * other->kind);

    self->length = newlen;
@@ -175,15 +193,16 @@ int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
/*
    Reverse the contents of the given textbuffer.
*/
void Textbuffer_reverse(Textbuffer* self)
void
Textbuffer_reverse(Textbuffer *self)
{
    Py_ssize_t i, end = self->length - 1;
    Py_UCS4 tmp;

    for (i = 0; i < self->length / 2; i++) {
        tmp = PyUnicode_READ(self->kind, self->data, i);
        PyUnicode_WRITE(self->kind, self->data, i,
                        PyUnicode_READ(self->kind, self->data, end - i));
        PyUnicode_WRITE(
            self->kind, self->data, i, PyUnicode_READ(self->kind, self->data, end - i));
        PyUnicode_WRITE(self->kind, self->data, end - i, tmp);
    }
}
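Textbuffer_write() grows the buffer geometrically (RESIZE_FACTOR doubles the capacity), giving amortized O(1) appends, while Textbuffer_concat() resizes once to the exact target length plus CONCAT_EXTRA slack. A driver sketch under those semantics (the helper is illustrative, not part of this file):

static int
write_ascii(Textbuffer *buf, const char *s)
{
    while (*s) {
        /* each call triggers at most one doubling resize */
        if (Textbuffer_write(buf, (Py_UCS4) (unsigned char) *s++) < 0)
            return -1;
    }
    return 0;
}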

+ 8
- 8
src/mwparserfromhell/parser/ctokenizer/textbuffer.h

@@ -26,11 +26,11 @@ SOFTWARE.

/* Functions */

Textbuffer* Textbuffer_new(TokenizerInput*);
void Textbuffer_dealloc(Textbuffer*);
int Textbuffer_reset(Textbuffer*);
int Textbuffer_write(Textbuffer*, Py_UCS4);
Py_UCS4 Textbuffer_read(Textbuffer*, Py_ssize_t);
PyObject* Textbuffer_render(Textbuffer*);
int Textbuffer_concat(Textbuffer*, Textbuffer*);
void Textbuffer_reverse(Textbuffer*);
Textbuffer *Textbuffer_new(TokenizerInput *);
void Textbuffer_dealloc(Textbuffer *);
int Textbuffer_reset(Textbuffer *);
int Textbuffer_write(Textbuffer *, Py_UCS4);
Py_UCS4 Textbuffer_read(Textbuffer *, Py_ssize_t);
PyObject *Textbuffer_render(Textbuffer *);
int Textbuffer_concat(Textbuffer *, Textbuffer *);
void Textbuffer_reverse(Textbuffer *);

+ 966
- 685
src/mwparserfromhell/parser/ctokenizer/tok_parse.c
File diff suppressed because it is too large


+ 4
- 3
src/mwparserfromhell/parser/ctokenizer/tok_parse.h

@@ -25,11 +25,12 @@ SOFTWARE.
#include "common.h"

static const Py_UCS4 MARKERS[] = {
    '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
    '-', '!', '\n', '\0'};
    '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
    '#', '*', ';', ':', '/', '-', '!', '\n', '\0',
};

#define NUM_MARKERS 19

/* Functions */

PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);
PyObject *Tokenizer_parse(Tokenizer *, uint64_t, int);
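NUM_MARKERS is kept in sync with the 19-entry MARKERS array by hand. A sizeof-derived count is a common alternative that cannot drift from the array (a sketch, not what this header does):

/* Evaluates to 19 for the array above, trailing comma included. */
#define NUM_MARKERS_DERIVED (sizeof(MARKERS) / sizeof(MARKERS[0]))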

+ 122
- 79
src/mwparserfromhell/parser/ctokenizer/tok_support.c

@@ -27,9 +27,10 @@ SOFTWARE.
/*
    Add a new token stack, context, and textbuffer to the list.
*/
int Tokenizer_push(Tokenizer* self, uint64_t context)
int
Tokenizer_push(Tokenizer *self, uint64_t context)
{
    Stack* top = malloc(sizeof(Stack));
    Stack *top = malloc(sizeof(Stack));

    if (!top) {
        PyErr_NoMemory();
@@ -38,8 +39,9 @@ int Tokenizer_push(Tokenizer* self, uint64_t context)
    top->stack = PyList_New(0);
    top->context = context;
    top->textbuffer = Textbuffer_new(&self->text);
    if (!top->textbuffer)
    if (!top->textbuffer) {
        return -1;
    }
    top->ident.head = self->head;
    top->ident.context = context;
    top->next = self->topstack;
@@ -51,16 +53,19 @@ int Tokenizer_push(Tokenizer* self, uint64_t context)
/*
    Push the textbuffer onto the stack as a Text node and clear it.
*/
int Tokenizer_push_textbuffer(Tokenizer* self)
int
Tokenizer_push_textbuffer(Tokenizer *self)
{
    PyObject *text, *kwargs, *token;
    Textbuffer* buffer = self->topstack->textbuffer;
    Textbuffer *buffer = self->topstack->textbuffer;

    if (buffer->length == 0)
    if (buffer->length == 0) {
        return 0;
    }
    text = Textbuffer_render(buffer);
    if (!text)
    if (!text) {
        return -1;
    }
    kwargs = PyDict_New();
    if (!kwargs) {
        Py_DECREF(text);
@@ -70,24 +75,27 @@ int Tokenizer_push_textbuffer(Tokenizer* self)
    Py_DECREF(text);
    token = PyObject_Call(Text, NOARGS, kwargs);
    Py_DECREF(kwargs);
    if (!token)
    if (!token) {
        return -1;
    }
    if (PyList_Append(self->topstack->stack, token)) {
        Py_DECREF(token);
        return -1;
    }
    Py_DECREF(token);
    if (Textbuffer_reset(buffer))
    if (Textbuffer_reset(buffer)) {
        return -1;
    }
    return 0;
}

/*
    Pop and deallocate the top token stack/context/textbuffer.
*/
void Tokenizer_delete_top_of_stack(Tokenizer* self)
void
Tokenizer_delete_top_of_stack(Tokenizer *self)
{
    Stack* top = self->topstack;
    Stack *top = self->topstack;

    Py_DECREF(top->stack);
    Textbuffer_dealloc(top->textbuffer);
@@ -99,12 +107,14 @@ void Tokenizer_delete_top_of_stack(Tokenizer* self)
/*
    Pop the current stack/context/textbuffer, returning the stack.
*/
PyObject* Tokenizer_pop(Tokenizer* self)
PyObject *
Tokenizer_pop(Tokenizer *self)
{
    PyObject* stack;
    PyObject *stack;

    if (Tokenizer_push_textbuffer(self))
    if (Tokenizer_push_textbuffer(self)) {
        return NULL;
    }
    stack = self->topstack->stack;
    Py_INCREF(stack);
    Tokenizer_delete_top_of_stack(self);
@@ -115,13 +125,15 @@ PyObject* Tokenizer_pop(Tokenizer* self)
    Pop the current stack/context/textbuffer, returning the stack. We will also
    replace the underlying stack's context with the current stack's.
*/
PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
PyObject *
Tokenizer_pop_keeping_context(Tokenizer *self)
{
    PyObject* stack;
    PyObject *stack;
    uint64_t context;

    if (Tokenizer_push_textbuffer(self))
    if (Tokenizer_push_textbuffer(self)) {
        return NULL;
    }
    stack = self->topstack->stack;
    Py_INCREF(stack);
    context = self->topstack->context;
@@ -133,16 +145,18 @@ PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
/*
    Compare two route_tree_nodes that are in their avl_tree_node forms.
*/
static int compare_nodes(
    const struct avl_tree_node* na, const struct avl_tree_node* nb)
static int
compare_nodes(const struct avl_tree_node *na, const struct avl_tree_node *nb)
{
    route_tree_node *a = avl_tree_entry(na, route_tree_node, node);
    route_tree_node *b = avl_tree_entry(nb, route_tree_node, node);

    if (a->id.head < b->id.head)
    if (a->id.head < b->id.head) {
        return -1;
    }
    if (a->id.head > b->id.head)
    if (a->id.head > b->id.head) {
        return 1;
    }
    return (a->id.context > b->id.context) - (a->id.context < b->id.context);
}

@@ -152,13 +166,15 @@ static int compare_nodes(
    This will be noticed when calling Tokenizer_check_route with the same head
    and context, and the route will be failed immediately.
*/
void Tokenizer_memoize_bad_route(Tokenizer *self)
void
Tokenizer_memoize_bad_route(Tokenizer *self)
{
    route_tree_node *node = malloc(sizeof(route_tree_node));
    if (node) {
        node->id = self->topstack->ident;
        if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
        if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) {
            free(node);
        }
    }
}

@@ -168,10 +184,11 @@ void Tokenizer_memoize_bad_route(Tokenizer *self)
    ident of the failed stack so future parsing attempts down this route can be
    stopped early.
*/
void* Tokenizer_fail_route(Tokenizer* self)
void *
Tokenizer_fail_route(Tokenizer *self)
{
    uint64_t context = self->topstack->context;
    PyObject* stack;
    PyObject *stack;

    Tokenizer_memoize_bad_route(self);
    stack = Tokenizer_pop(self);
@@ -193,10 +210,11 @@ void* Tokenizer_fail_route(Tokenizer* self)
    but this would introduce too much overhead in C tokenizer due to the need
    to check for a bad route after every call to Tokenizer_push.)
*/
int Tokenizer_check_route(Tokenizer* self, uint64_t context)
int
Tokenizer_check_route(Tokenizer *self, uint64_t context)
{
    StackIdent ident = {self->head, context};
    struct avl_tree_node *node = (struct avl_tree_node*) (&ident + 1);
    struct avl_tree_node *node = (struct avl_tree_node *) (&ident + 1);

    if (avl_tree_lookup_node(self->bad_routes, node, compare_nodes)) {
        FAIL_ROUTE(context);
@@ -209,7 +227,8 @@ int Tokenizer_check_route(Tokenizer* self, uint64_t context)
    Free the tokenizer's bad route cache tree. Intended to be called by the
    main tokenizer function after parsing is finished.
*/
void Tokenizer_free_bad_route_tree(Tokenizer *self)
void
Tokenizer_free_bad_route_tree(Tokenizer *self)
{
    struct avl_tree_node *cur = avl_tree_first_in_postorder(self->bad_routes);
    struct avl_tree_node *parent;
@@ -225,17 +244,20 @@ void Tokenizer_free_bad_route_tree(Tokenizer *self)
/*
    Write a token to the current token stack.
*/
int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first)
int
Tokenizer_emit_token(Tokenizer *self, PyObject *token, int first)
{
    PyObject* instance;
    PyObject *instance;

    if (Tokenizer_push_textbuffer(self))
    if (Tokenizer_push_textbuffer(self)) {
        return -1;
    }
    instance = PyObject_CallObject(token, NULL);
    if (!instance)
    if (!instance) {
        return -1;
    }
    if (first ? PyList_Insert(self->topstack->stack, 0, instance) :
            PyList_Append(self->topstack->stack, instance)) {
    if (first ? PyList_Insert(self->topstack->stack, 0, instance)
              : PyList_Append(self->topstack->stack, instance)) {
        Py_DECREF(instance);
        return -1;
    }
@@ -247,10 +269,13 @@ int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first)
    Write a token to the current token stack, with kwargs. Steals a reference
    to kwargs.
*/
int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
                                PyObject* kwargs, int first)
int
Tokenizer_emit_token_kwargs(Tokenizer *self,
                            PyObject *token,
                            PyObject *kwargs,
                            int first)
{
    PyObject* instance;
    PyObject *instance;

    if (Tokenizer_push_textbuffer(self)) {
        Py_DECREF(kwargs);
@@ -261,8 +286,8 @@ int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
        Py_DECREF(kwargs);
        return -1;
    }
    if (first ? PyList_Insert(self->topstack->stack, 0, instance):
            PyList_Append(self->topstack->stack, instance)) {
    if (first ? PyList_Insert(self->topstack->stack, 0, instance)
              : PyList_Append(self->topstack->stack, instance)) {
        Py_DECREF(instance);
        Py_DECREF(kwargs);
        return -1;
@@ -275,7 +300,8 @@ int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
/*
    Write a Unicode codepoint to the current textbuffer.
*/
int Tokenizer_emit_char(Tokenizer* self, Py_UCS4 code)
int
Tokenizer_emit_char(Tokenizer *self, Py_UCS4 code)
{
    return Textbuffer_write(self->topstack->textbuffer, code);
}
@@ -283,13 +309,15 @@ int Tokenizer_emit_char(Tokenizer* self, Py_UCS4 code)
/*
    Write a string of text to the current textbuffer.
*/
int Tokenizer_emit_text(Tokenizer* self, const char* text)
int
Tokenizer_emit_text(Tokenizer *self, const char *text)
{
    int i = 0;

    while (text[i]) {
        if (Tokenizer_emit_char(self, text[i]))
        if (Tokenizer_emit_char(self, text[i])) {
            return -1;
        }
        i++;
    }
    return 0;
@@ -299,7 +327,8 @@ int Tokenizer_emit_text(Tokenizer* self, const char* text)
    Write the contents of another textbuffer to the current textbuffer,
    deallocating it in the process.
*/
int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer)
int
Tokenizer_emit_textbuffer(Tokenizer *self, Textbuffer *buffer)
{
    int retval = Textbuffer_concat(self->topstack->textbuffer, buffer);
    Textbuffer_dealloc(buffer);
@@ -309,55 +338,63 @@ int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer)
/*
    Write a series of tokens to the current stack at once.
*/
int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
int
Tokenizer_emit_all(Tokenizer *self, PyObject *tokenlist)
{
    int pushed = 0;
    PyObject *stack, *token, *left, *right, *text;
    Textbuffer* buffer;
    Textbuffer *buffer;
    Py_ssize_t size;

    if (PyList_GET_SIZE(tokenlist) > 0) {
        token = PyList_GET_ITEM(tokenlist, 0);
        switch (PyObject_IsInstance(token, Text)) {
            case 0:
                break;
            case 1: {
                pushed = 1;
                buffer = self->topstack->textbuffer;
                if (buffer->length == 0)
                    break;
                left = Textbuffer_render(buffer);
                if (!left)
                    return -1;
                right = PyObject_GetAttrString(token, "text");
                if (!right)
                    return -1;
                text = PyUnicode_Concat(left, right);
                Py_DECREF(left);
                Py_DECREF(right);
                if (!text)
                    return -1;
                if (PyObject_SetAttrString(token, "text", text)) {
                    Py_DECREF(text);
                    return -1;
                }
                Py_DECREF(text);
                if (Textbuffer_reset(buffer))
                    return -1;
                break;
            }
            case -1:
                return -1;
        }
        case 0:
            break;
        case 1: {
            pushed = 1;
            buffer = self->topstack->textbuffer;
            if (buffer->length == 0) {
                break;
            }
            left = Textbuffer_render(buffer);
            if (!left) {
                return -1;
            }
            right = PyObject_GetAttrString(token, "text");
            if (!right) {
                return -1;
            }
            text = PyUnicode_Concat(left, right);
            Py_DECREF(left);
            Py_DECREF(right);
            if (!text) {
                return -1;
            }
            if (PyObject_SetAttrString(token, "text", text)) {
                Py_DECREF(text);
                return -1;
            }
            Py_DECREF(text);
            if (Textbuffer_reset(buffer)) {
                return -1;
            }
            break;
        }
        case -1:
            return -1;
        }
    }
    if (!pushed) {
        if (Tokenizer_push_textbuffer(self))
        if (Tokenizer_push_textbuffer(self)) {
            return -1;
        }
    }
    stack = self->topstack->stack;
    size = PyList_GET_SIZE(stack);
    if (PyList_SetSlice(stack, size, size, tokenlist))
    if (PyList_SetSlice(stack, size, size, tokenlist)) {
        return -1;
    }
    return 0;
}

@@ -365,9 +402,10 @@ int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
    Pop the current stack, write text, and then write the stack. 'text' is a
    NULL-terminated array of chars.
*/
int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
int
Tokenizer_emit_text_then_stack(Tokenizer *self, const char *text)
{
    PyObject* stack = Tokenizer_pop(self);
    PyObject *stack = Tokenizer_pop(self);

    if (Tokenizer_emit_text(self, text)) {
        Py_DECREF(stack);
@@ -389,7 +427,8 @@ int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
/*
    Internal function to read the codepoint at the given index from the input.
*/
static Py_UCS4 read_codepoint(TokenizerInput* text, Py_ssize_t index)
static Py_UCS4
read_codepoint(TokenizerInput *text, Py_ssize_t index)
{
    return PyUnicode_READ(text->kind, text->data, index);
}
@@ -397,24 +436,28 @@ static Py_UCS4 read_codepoint(TokenizerInput* text, Py_ssize_t index)
/*
    Read the value at a relative point in the wikicode, forwards.
*/
Py_UCS4 Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
Py_UCS4
Tokenizer_read(Tokenizer *self, Py_ssize_t delta)
{
    Py_ssize_t index = self->head + delta;

    if (index >= self->text.length)
    if (index >= self->text.length) {
        return '\0';
    }
    return read_codepoint(&self->text, index);
}

/*
    Read the value at a relative point in the wikicode, backwards.
*/
Py_UCS4 Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
Py_UCS4
Tokenizer_read_backwards(Tokenizer *self, Py_ssize_t delta)
{
    Py_ssize_t index;

    if (delta > self->head)
    if (delta > self->head) {
        return '\0';
    }
    index = self->head - delta;
    return read_codepoint(&self->text, index);
}
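Tokenizer_read() and Tokenizer_read_backwards() return '\0' instead of failing when the requested offset leaves the input, so scanning loops need no separate bounds check. A caller sketch (helper name illustrative, not from this file):

static Py_ssize_t
measure_until(Tokenizer *self, Py_UCS4 stop)
{
    Py_ssize_t delta = 0;
    Py_UCS4 c;

    /* '\0' doubles as the end-of-input sentinel */
    while ((c = Tokenizer_read(self, delta)) != '\0' && c != stop)
        delta++;
    return delta;
}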

+ 28
- 31
src/mwparserfromhell/parser/ctokenizer/tok_support.h

@@ -26,41 +26,38 @@ SOFTWARE.

/* Functions */

int Tokenizer_push(Tokenizer*, uint64_t);
int Tokenizer_push_textbuffer(Tokenizer*);
void Tokenizer_delete_top_of_stack(Tokenizer*);
PyObject* Tokenizer_pop(Tokenizer*);
PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
void Tokenizer_memoize_bad_route(Tokenizer*);
void* Tokenizer_fail_route(Tokenizer*);
int Tokenizer_check_route(Tokenizer*, uint64_t);
void Tokenizer_free_bad_route_tree(Tokenizer*);
int Tokenizer_emit_token(Tokenizer*, PyObject*, int);
int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int);
int Tokenizer_emit_char(Tokenizer*, Py_UCS4);
int Tokenizer_emit_text(Tokenizer*, const char*);
int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*);
int Tokenizer_emit_all(Tokenizer*, PyObject*);
int Tokenizer_emit_text_then_stack(Tokenizer*, const char*);
Py_UCS4 Tokenizer_read(Tokenizer*, Py_ssize_t);
Py_UCS4 Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);
int Tokenizer_push(Tokenizer *, uint64_t);
int Tokenizer_push_textbuffer(Tokenizer *);
void Tokenizer_delete_top_of_stack(Tokenizer *);
PyObject *Tokenizer_pop(Tokenizer *);
PyObject *Tokenizer_pop_keeping_context(Tokenizer *);
void Tokenizer_memoize_bad_route(Tokenizer *);
void *Tokenizer_fail_route(Tokenizer *);
int Tokenizer_check_route(Tokenizer *, uint64_t);
void Tokenizer_free_bad_route_tree(Tokenizer *);
int Tokenizer_emit_token(Tokenizer *, PyObject *, int);
int Tokenizer_emit_token_kwargs(Tokenizer *, PyObject *, PyObject *, int);
int Tokenizer_emit_char(Tokenizer *, Py_UCS4);
int Tokenizer_emit_text(Tokenizer *, const char *);
int Tokenizer_emit_textbuffer(Tokenizer *, Textbuffer *);
int Tokenizer_emit_all(Tokenizer *, PyObject *);
int Tokenizer_emit_text_then_stack(Tokenizer *, const char *);
Py_UCS4 Tokenizer_read(Tokenizer *, Py_ssize_t);
Py_UCS4 Tokenizer_read_backwards(Tokenizer *, Py_ssize_t);

/* Macros */

#define MAX_DEPTH 40
#define Tokenizer_CAN_RECURSE(self) \
    (self->depth < MAX_DEPTH)
#define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH)
#define Tokenizer_IS_CURRENT_STACK(self, id) \
    (self->topstack->ident.head == (id).head && \
     self->topstack->ident.context == (id).context)

#define Tokenizer_emit(self, token) \
    Tokenizer_emit_token(self, token, 0)
#define Tokenizer_emit_first(self, token) \
    Tokenizer_emit_token(self, token, 1)
#define Tokenizer_emit(self, token) Tokenizer_emit_token(self, token, 0)
#define Tokenizer_emit_first(self, token) Tokenizer_emit_token(self, token, 1)
#define Tokenizer_emit_kwargs(self, token, kwargs) \
    Tokenizer_emit_token_kwargs(self, token, kwargs, 0)
#define Tokenizer_emit_first_kwargs(self, token, kwargs) \
    Tokenizer_emit_token_kwargs(self, token, kwargs, 1)

+ 94
- 67
src/mwparserfromhell/parser/ctokenizer/tokenizer.c

@@ -30,12 +30,12 @@ SOFTWARE.
int route_state;
uint64_t route_context;

char** entitydefs;
char **entitydefs;

PyObject* NOARGS;
PyObject* definitions;
PyObject *NOARGS;
PyObject *definitions;

static PyObject* ParserError;
static PyObject *ParserError;

/* Forward declarations */

@@ -44,17 +44,18 @@ static int load_exceptions(void);
/*
    Create a new tokenizer object.
*/
static PyObject*
Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
static PyObject *
Tokenizer_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    Tokenizer* self = (Tokenizer*) type->tp_alloc(type, 0);
    return (PyObject*) self;
    Tokenizer *self = (Tokenizer *) type->tp_alloc(type, 0);
    return (PyObject *) self;
}

/*
    Deallocate the given tokenizer's text field.
*/
static void dealloc_tokenizer_text(TokenizerInput* text)
static void
dealloc_tokenizer_text(TokenizerInput *text)
{
    Py_XDECREF(text->object);
}
@@ -62,7 +63,8 @@ static void dealloc_tokenizer_text(TokenizerInput* text)
/*
    Deallocate the given tokenizer object.
*/
static void Tokenizer_dealloc(Tokenizer* self)
static void
Tokenizer_dealloc(Tokenizer *self)
{
    Stack *this = self->topstack, *next;
    dealloc_tokenizer_text(&self->text);
@@ -74,13 +76,14 @@ static void Tokenizer_dealloc(Tokenizer* self)
        free(this);
        this = next;
    }
    Py_TYPE(self)->tp_free((PyObject*) self);
    Py_TYPE(self)->tp_free((PyObject *) self);
}

/*
    Initialize a new tokenizer instance's text field.
*/
static void init_tokenizer_text(TokenizerInput* text)
static void
init_tokenizer_text(TokenizerInput *text)
{
    text->object = Py_None;
    Py_INCREF(Py_None);
@@ -92,12 +95,14 @@ static void init_tokenizer_text(TokenizerInput* text)
/*
    Initialize a new tokenizer instance by setting instance attributes.
*/
static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
static int
Tokenizer_init(Tokenizer *self, PyObject *args, PyObject *kwds)
{
    static char* kwlist[] = {NULL};
    static char *kwlist[] = {NULL};

    if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist)) {
        return -1;
    }
    init_tokenizer_text(&self->text);
    self->topstack = NULL;
    self->head = self->global = self->depth = 0;
@@ -110,13 +115,15 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
/*
    Load input text into the tokenizer.
*/
static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
static int
load_tokenizer_text(TokenizerInput *text, PyObject *input)
{
    dealloc_tokenizer_text(text);
    text->object = input;

    if (PyUnicode_READY(input) < 0)
    if (PyUnicode_READY(input) < 0) {
        return -1;
    }
    text->kind = PyUnicode_KIND(input);
    text->data = PyUnicode_DATA(input);
    text->length = PyUnicode_GET_LENGTH(input);
@@ -126,7 +133,8 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
/*
    Build a list of tokens from a string of wikicode and return it.
*/
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
static PyObject *
Tokenizer_tokenize(Tokenizer *self, PyObject *args)
{
    PyObject *input, *tokens;
    unsigned long long context = 0;
@@ -134,22 +142,25 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)

    if (PyArg_ParseTuple(args, "U|Kp", &input, &context, &skip_style_tags)) {
        Py_INCREF(input);
        if (load_tokenizer_text(&self->text, input))
        if (load_tokenizer_text(&self->text, input)) {
            return NULL;
        }
    }
    else {
    } else {
        const char *encoded;
        Py_ssize_t size;

        /* Failed to parse a Unicode object; try a string instead. */
        PyErr_Clear();
        if (!PyArg_ParseTuple(args, "s#|Kp", &encoded, &size, &context,
                              &skip_style_tags))
        if (!PyArg_ParseTuple(
                args, "s#|Kp", &encoded, &size, &context, &skip_style_tags)) {
            return NULL;
        }
        if (!(input = PyUnicode_FromStringAndSize(encoded, size)))
        if (!(input = PyUnicode_FromStringAndSize(encoded, size))) {
            return NULL;
        }
        if (load_tokenizer_text(&self->text, input))
        if (load_tokenizer_text(&self->text, input)) {
            return NULL;
        }
    }

    self->head = self->global = self->depth = 0;
@@ -162,73 +173,83 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)

    if (!tokens || self->topstack) {
        Py_XDECREF(tokens);
        if (PyErr_Occurred())
        if (PyErr_Occurred()) {
            return NULL;
        }
        if (!ParserError && load_exceptions() < 0)
        if (!ParserError && load_exceptions() < 0) {
            return NULL;
        }
        if (BAD_ROUTE) {
            RESET_ROUTE();
            PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
        }
        else if (self->topstack)
        } else if (self->topstack) {
            PyErr_SetString(ParserError,
                            "C tokenizer exited with non-empty token stack");
        else
        } else {
            PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
        }
        return NULL;
    }
    return tokens;
}

static int load_entities(void)
static int
load_entities(void)
{
    PyObject *tempmod, *defmap, *deflist;
    unsigned numdefs, i;
    PyObject *string;

    tempmod = PyImport_ImportModule("html.entities");
    if (!tempmod)
    if (!tempmod) {
        return -1;
    }
    defmap = PyObject_GetAttrString(tempmod, "entitydefs");
    if (!defmap)
    if (!defmap) {
        return -1;
    }
    Py_DECREF(tempmod);
    deflist = PyDict_Keys(defmap);
    if (!deflist)
    if (!deflist) {
        return -1;
    }
    Py_DECREF(defmap);
    numdefs = (unsigned) PyList_GET_SIZE(deflist);
    entitydefs = calloc(numdefs + 1, sizeof(char*));
    if (!entitydefs)
    entitydefs = calloc(numdefs + 1, sizeof(char *));
    if (!entitydefs) {
        return -1;
    }
    for (i = 0; i < numdefs; i++) {
        string = PyUnicode_AsASCIIString(PyList_GET_ITEM(deflist, i));
        if (!string)
        if (!string) {
            return -1;
        }
        entitydefs[i] = PyBytes_AsString(string);
        if (!entitydefs[i])
        if (!entitydefs[i]) {
            return -1;
        }
    }
    Py_DECREF(deflist);
    return 0;
}

static int load_tokens(void)
static int
load_tokens(void)
{
    PyObject *tempmod, *tokens,
        *globals = PyEval_GetGlobals(),
        *locals = PyEval_GetLocals(),
        *fromlist = PyList_New(1),
        *modname = PyUnicode_FromString("tokens");
    PyObject *tempmod, *tokens;
    PyObject *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1), *modname = PyUnicode_FromString("tokens");
    char *name = "mwparserfromhell.parser";

    if (!fromlist || !modname)
    if (!fromlist || !modname) {
        return -1;
    }
    PyList_SET_ITEM(fromlist, 0, modname);
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
    if (!tempmod) {
        return -1;
    }
    tokens = PyObject_GetAttrString(tempmod, "tokens");
    Py_DECREF(tempmod);
    load_tokens_from_module(tokens);
@@ -236,43 +257,45 @@ static int load_tokens(void)
    return 0;
}

static int load_defs(void)
static int
load_defs(void)
{
    PyObject *tempmod,
        *globals = PyEval_GetGlobals(),
        *locals = PyEval_GetLocals(),
        *fromlist = PyList_New(1),
        *modname = PyUnicode_FromString("definitions");
    PyObject *tempmod;
    PyObject *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1), *modname = PyUnicode_FromString("definitions");
    char *name = "mwparserfromhell";

    if (!fromlist || !modname)
    if (!fromlist || !modname) {
        return -1;
    }
    PyList_SET_ITEM(fromlist, 0, modname);
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
    if (!tempmod) {
        return -1;
    }
    definitions = PyObject_GetAttrString(tempmod, "definitions");
    Py_DECREF(tempmod);
    return 0;
}

static int load_exceptions(void)
static int
load_exceptions(void)
{
    PyObject *tempmod, *parsermod,
        *globals = PyEval_GetGlobals(),
        *locals = PyEval_GetLocals(),
        *fromlist = PyList_New(1),
        *modname = PyUnicode_FromString("parser");
    PyObject *tempmod, *parsermod;
    PyObject *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1), *modname = PyUnicode_FromString("parser");
    char *name = "mwparserfromhell";

    if (!fromlist || !modname)
    if (!fromlist || !modname) {
        return -1;
    }
    PyList_SET_ITEM(fromlist, 0, modname);
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
    if (!tempmod) {
        return -1;
    }
    parsermod = PyObject_GetAttrString(tempmod, "parser");
    Py_DECREF(tempmod);
    ParserError = PyObject_GetAttrString(parsermod, "ParserError");
@@ -280,22 +303,26 @@ static int load_exceptions(void)
    return 0;
}

PyMODINIT_FUNC PyInit__tokenizer(void)
PyMODINIT_FUNC
PyInit__tokenizer(void)
{
    PyObject *module;

    TokenizerType.tp_new = PyType_GenericNew;
    if (PyType_Ready(&TokenizerType) < 0)
    if (PyType_Ready(&TokenizerType) < 0) {
        return NULL;
    }
    module = PyModule_Create(&module_def);
    if (!module)
    if (!module) {
        return NULL;
    }
    Py_INCREF(&TokenizerType);
    PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
    PyModule_AddObject(module, "CTokenizer", (PyObject *) &TokenizerType);
    Py_INCREF(Py_True);
    PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
    NOARGS = PyTuple_New(0);
    if (!NOARGS || load_entities() || load_tokens() || load_defs())
    if (!NOARGS || load_entities() || load_tokens() || load_defs()) {
        return NULL;
    }
    return module;
}

+ 56
- 47
src/mwparserfromhell/parser/ctokenizer/tokenizer.h

@@ -27,67 +27,76 @@ SOFTWARE.

/* Functions */

static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
static void Tokenizer_dealloc(Tokenizer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
static PyObject *Tokenizer_new(PyTypeObject *, PyObject *, PyObject *);
static void Tokenizer_dealloc(Tokenizer *);
static int Tokenizer_init(Tokenizer *, PyObject *, PyObject *);
static PyObject *Tokenizer_tokenize(Tokenizer *, PyObject *);

/* Structs */

static PyMethodDef Tokenizer_methods[] = {
    {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
     "Build a list of tokens from a string of wikicode and return it."},
    {NULL}
    {
        "tokenize",
        (PyCFunction) Tokenizer_tokenize,
        METH_VARARGS,
        "Build a list of tokens from a string of wikicode and return it.",
    },
    {NULL},
};

static PyMemberDef Tokenizer_members[] = {
    {NULL}
    {NULL},
};

static PyTypeObject TokenizerType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "_tokenizer.CTokenizer", /* tp_name */
    sizeof(Tokenizer), /* tp_basicsize */
    0, /* tp_itemsize */
    (destructor) Tokenizer_dealloc, /* tp_dealloc */
    0, /* tp_print */
    0, /* tp_getattr */
    0, /* tp_setattr */
    0, /* tp_compare */
    0, /* tp_repr */
    0, /* tp_as_number */
    0, /* tp_as_sequence */
    0, /* tp_as_mapping */
    0, /* tp_hash */
    0, /* tp_call */
    0, /* tp_str */
    0, /* tp_getattro */
    0, /* tp_setattro */
    0, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT, /* tp_flags */
    "Creates a list of tokens from a string of wikicode.", /* tp_doc */
    0, /* tp_traverse */
    0, /* tp_clear */
    0, /* tp_richcompare */
    0, /* tp_weaklistoffset */
    0, /* tp_iter */
    0, /* tp_iternext */
    Tokenizer_methods, /* tp_methods */
    Tokenizer_members, /* tp_members */
    0, /* tp_getset */
    0, /* tp_base */
    0, /* tp_dict */
    0, /* tp_descr_get */
    0, /* tp_descr_set */
    0, /* tp_dictoffset */
    (initproc) Tokenizer_init, /* tp_init */
    0, /* tp_alloc */
    Tokenizer_new, /* tp_new */
    PyVarObject_HEAD_INIT(NULL, 0) /* header */
    "_tokenizer.CTokenizer",                               /* tp_name */
    sizeof(Tokenizer),                                     /* tp_basicsize */
    0,                                                     /* tp_itemsize */
    (destructor) Tokenizer_dealloc,                        /* tp_dealloc */
    0,                                                     /* tp_print */
    0,                                                     /* tp_getattr */
    0,                                                     /* tp_setattr */
    0,                                                     /* tp_compare */
    0,                                                     /* tp_repr */
    0,                                                     /* tp_as_number */
    0,                                                     /* tp_as_sequence */
    0,                                                     /* tp_as_mapping */
    0,                                                     /* tp_hash */
    0,                                                     /* tp_call */
    0,                                                     /* tp_str */
    0,                                                     /* tp_getattro */
    0,                                                     /* tp_setattro */
    0,                                                     /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                                    /* tp_flags */
    "Creates a list of tokens from a string of wikicode.", /* tp_doc */
    0,                                                     /* tp_traverse */
    0,                                                     /* tp_clear */
    0,                                                     /* tp_richcompare */
    0,                                                     /* tp_weaklistoffset */
    0,                                                     /* tp_iter */
    0,                                                     /* tp_iternext */
    Tokenizer_methods,                                     /* tp_methods */
    Tokenizer_members,                                     /* tp_members */
    0,                                                     /* tp_getset */
    0,                                                     /* tp_base */
    0,                                                     /* tp_dict */
    0,                                                     /* tp_descr_get */
    0,                                                     /* tp_descr_set */
    0,                                                     /* tp_dictoffset */
    (initproc) Tokenizer_init,                             /* tp_init */
    0,                                                     /* tp_alloc */
    Tokenizer_new,                                         /* tp_new */
};

static PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,
    "_tokenizer",
    "Creates a list of tokens from a string of wikicode.",
    -1, NULL, NULL, NULL, NULL, NULL
    -1,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL,
};
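The TokenizerType table above fills every slot positionally, which is why each entry carries a /* tp_* */ comment. C99 designated initializers, the pattern shown in current CPython extension documentation, express the same table self-descriptively and zero the untouched slots implicitly. A sketch of the equivalent (not what this file does; only the filled slots are shown):

static PyTypeObject TokenizerTypeAlt = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "_tokenizer.CTokenizer",
    .tp_basicsize = sizeof(Tokenizer),
    .tp_dealloc = (destructor) Tokenizer_dealloc,
    .tp_flags = Py_TPFLAGS_DEFAULT,
    .tp_doc = "Creates a list of tokens from a string of wikicode.",
    .tp_methods = Tokenizer_methods,
    .tp_members = Tokenizer_members,
    .tp_init = (initproc) Tokenizer_init,
    .tp_new = Tokenizer_new,
};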

+ 42
- 44
src/mwparserfromhell/parser/ctokenizer/tokens.c

@@ -24,56 +24,55 @@ SOFTWARE.

/* Globals */

PyObject* Text;
PyObject* TemplateOpen;
PyObject* TemplateParamSeparator;
PyObject* TemplateParamEquals;
PyObject* TemplateClose;
PyObject* ArgumentOpen;
PyObject* ArgumentSeparator;
PyObject* ArgumentClose;
PyObject* WikilinkOpen;
PyObject* WikilinkSeparator;
PyObject* WikilinkClose;
PyObject* ExternalLinkOpen;
PyObject* ExternalLinkSeparator;
PyObject* ExternalLinkClose;
PyObject* HTMLEntityStart;
PyObject* HTMLEntityNumeric;
PyObject* HTMLEntityHex;
PyObject* HTMLEntityEnd;
PyObject* HeadingStart;
PyObject* HeadingEnd;
PyObject* CommentStart;
PyObject* CommentEnd;
PyObject* TagOpenOpen;
PyObject* TagAttrStart;
PyObject* TagAttrEquals;
PyObject* TagAttrQuote;
PyObject* TagCloseOpen;
PyObject* TagCloseSelfclose;
PyObject* TagOpenClose;
PyObject* TagCloseClose;
PyObject *Text;
PyObject *TemplateOpen;
PyObject *TemplateParamSeparator;
PyObject *TemplateParamEquals;
PyObject *TemplateClose;
PyObject *ArgumentOpen;
PyObject *ArgumentSeparator;
PyObject *ArgumentClose;
PyObject *WikilinkOpen;
PyObject *WikilinkSeparator;
PyObject *WikilinkClose;
PyObject *ExternalLinkOpen;
PyObject *ExternalLinkSeparator;
PyObject *ExternalLinkClose;
PyObject *HTMLEntityStart;
PyObject *HTMLEntityNumeric;
PyObject *HTMLEntityHex;
PyObject *HTMLEntityEnd;
PyObject *HeadingStart;
PyObject *HeadingEnd;
PyObject *CommentStart;
PyObject *CommentEnd;
PyObject *TagOpenOpen;
PyObject *TagAttrStart;
PyObject *TagAttrEquals;
PyObject *TagAttrQuote;
PyObject *TagCloseOpen;
PyObject *TagCloseSelfclose;
PyObject *TagOpenClose;
PyObject *TagCloseClose;

/*
    Load individual tokens into globals from the given Python module object.
*/
void load_tokens_from_module(PyObject* module)
void
load_tokens_from_module(PyObject *module)
{
    Text = PyObject_GetAttrString(module, "Text");

    TemplateOpen = PyObject_GetAttrString(module, "TemplateOpen");
    TemplateParamSeparator = PyObject_GetAttrString(module,
                                                    "TemplateParamSeparator");
    TemplateParamEquals = PyObject_GetAttrString(module,
                                                 "TemplateParamEquals");
    TemplateParamSeparator = PyObject_GetAttrString(module, "TemplateParamSeparator");
    TemplateParamEquals = PyObject_GetAttrString(module, "TemplateParamEquals");
    TemplateClose = PyObject_GetAttrString(module, "TemplateClose");

    ArgumentOpen = PyObject_GetAttrString(module, "ArgumentOpen");
@@ -85,8 +84,7 @@ void load_tokens_from_module(PyObject* module)
    WikilinkClose = PyObject_GetAttrString(module, "WikilinkClose");

    ExternalLinkOpen = PyObject_GetAttrString(module, "ExternalLinkOpen");
    ExternalLinkSeparator = PyObject_GetAttrString(module,
                                                   "ExternalLinkSeparator");
    ExternalLinkSeparator = PyObject_GetAttrString(module, "ExternalLinkSeparator");
    ExternalLinkClose = PyObject_GetAttrString(module, "ExternalLinkClose");

    HTMLEntityStart = PyObject_GetAttrString(module, "HTMLEntityStart");


+ 38
- 38
src/mwparserfromhell/parser/ctokenizer/tokens.h

@@ -26,44 +26,44 @@ SOFTWARE.

/* Token globals */

-extern PyObject* Text;
-extern PyObject* TemplateOpen;
-extern PyObject* TemplateParamSeparator;
-extern PyObject* TemplateParamEquals;
-extern PyObject* TemplateClose;
-extern PyObject* ArgumentOpen;
-extern PyObject* ArgumentSeparator;
-extern PyObject* ArgumentClose;
-extern PyObject* WikilinkOpen;
-extern PyObject* WikilinkSeparator;
-extern PyObject* WikilinkClose;
-extern PyObject* ExternalLinkOpen;
-extern PyObject* ExternalLinkSeparator;
-extern PyObject* ExternalLinkClose;
-extern PyObject* HTMLEntityStart;
-extern PyObject* HTMLEntityNumeric;
-extern PyObject* HTMLEntityHex;
-extern PyObject* HTMLEntityEnd;
-extern PyObject* HeadingStart;
-extern PyObject* HeadingEnd;
-extern PyObject* CommentStart;
-extern PyObject* CommentEnd;
-extern PyObject* TagOpenOpen;
-extern PyObject* TagAttrStart;
-extern PyObject* TagAttrEquals;
-extern PyObject* TagAttrQuote;
-extern PyObject* TagCloseOpen;
-extern PyObject* TagCloseSelfclose;
-extern PyObject* TagOpenClose;
-extern PyObject* TagCloseClose;
+extern PyObject *Text;
+extern PyObject *TemplateOpen;
+extern PyObject *TemplateParamSeparator;
+extern PyObject *TemplateParamEquals;
+extern PyObject *TemplateClose;
+extern PyObject *ArgumentOpen;
+extern PyObject *ArgumentSeparator;
+extern PyObject *ArgumentClose;
+extern PyObject *WikilinkOpen;
+extern PyObject *WikilinkSeparator;
+extern PyObject *WikilinkClose;
+extern PyObject *ExternalLinkOpen;
+extern PyObject *ExternalLinkSeparator;
+extern PyObject *ExternalLinkClose;
+extern PyObject *HTMLEntityStart;
+extern PyObject *HTMLEntityNumeric;
+extern PyObject *HTMLEntityHex;
+extern PyObject *HTMLEntityEnd;
+extern PyObject *HeadingStart;
+extern PyObject *HeadingEnd;
+extern PyObject *CommentStart;
+extern PyObject *CommentEnd;
+extern PyObject *TagOpenOpen;
+extern PyObject *TagAttrStart;
+extern PyObject *TagAttrEquals;
+extern PyObject *TagAttrQuote;
+extern PyObject *TagCloseOpen;
+extern PyObject *TagCloseSelfclose;
+extern PyObject *TagOpenClose;
+extern PyObject *TagCloseClose;

/* Functions */

-void load_tokens_from_module(PyObject*);
+void load_tokens_from_module(PyObject *);

+ 2
- 0
src/mwparserfromhell/parser/errors.py

@@ -20,6 +20,7 @@

__all__ = ["ParserError"]


class ParserError(Exception):
    """Exception raised when an internal error occurs while parsing.

@@ -28,6 +29,7 @@ class ParserError(Exception):
    with an impossible internal state and is bailing out before other problems
    can happen. Its appearance indicates a bug.
    """

    def __init__(self, extra):
        msg = "This is a bug and should be reported. Info: {}.".format(extra)
        super().__init__(msg)

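The two added lines are just the blank lines black requires, but the docstring is worth illustrating: ParserError should never escape on a correct build, so catching it is effectively collecting bug reports. A minimal sketch:

    import mwparserfromhell
    from mwparserfromhell.parser.errors import ParserError

    try:
        code = mwparserfromhell.parse("{{foo|bar}}")
    except ParserError as exc:
        # Unreachable on a correct build; the message asks for a bug report.
        print("parser bug:", exc)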
+ 126
- 52
src/mwparserfromhell/parser/tokenizer.py

@@ -24,11 +24,17 @@ import re

from . import contexts, tokens
from .errors import ParserError
-from ..definitions import (get_html_tag, is_parsable, is_single,
-                           is_single_only, is_scheme)
+from ..definitions import (
+    get_html_tag,
+    is_parsable,
+    is_single,
+    is_single_only,
+    is_scheme,
+)

__all__ = ["Tokenizer"]


class BadRoute(Exception):
    """Raised internally when the current tokenization route is invalid."""

@@ -39,14 +45,15 @@

class _TagOpenData:
    """Stores data about an HTML open tag, like ``<ref name="foo">``."""

    CX_NAME = 1 << 0
    CX_ATTR_READY = 1 << 1
    CX_ATTR_NAME = 1 << 2
    CX_ATTR_VALUE = 1 << 3
    CX_QUOTED = 1 << 4
    CX_NOTE_SPACE = 1 << 5
    CX_NOTE_EQUALS = 1 << 6
    CX_NOTE_QUOTE = 1 << 7

    def __init__(self):
        self.context = self.CX_NAME
@@ -57,11 +64,33 @@

class Tokenizer:
    """Creates a list of tokens from a string of wikicode."""

    USES_C = False
    START = object()
    END = object()
-    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", '"', "#", "*", ";",
-               ":", "/", "-", "!", "\n", START, END]
+    MARKERS = [
+        "{",
+        "}",
+        "[",
+        "]",
+        "<",
+        ">",
+        "|",
+        "=",
+        "&",
+        "'",
+        '"',
+        "#",
+        "*",
+        ";",
+        ":",
+        "/",
+        "-",
+        "!",
+        "\n",
+        START,
+        END,
+    ]
    URISCHEME = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
    MAX_DEPTH = 40
    regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
@@ -437,13 +466,15 @@
        """Return whether the current head is the end of a URI."""
        # Built from _parse()'s end sentinels:
        after, ctx = self._read(2), self._context
-        return (this in (self.END, "\n", "[", "]", "<", ">", '"') or
-                " " in this or
-                this == nxt == "'" or
-                (this == "|" and ctx & contexts.TEMPLATE) or
-                (this == "=" and ctx & (contexts.TEMPLATE_PARAM_KEY | contexts.HEADING)) or
-                (this == nxt == "}" and ctx & contexts.TEMPLATE) or
-                (this == nxt == after == "}" and ctx & contexts.ARGUMENT))
+        return (
+            this in (self.END, "\n", "[", "]", "<", ">", '"')
+            or " " in this
+            or this == nxt == "'"
+            or (this == "|" and ctx & contexts.TEMPLATE)
+            or (this == "=" and ctx & (contexts.TEMPLATE_PARAM_KEY | contexts.HEADING))
+            or (this == nxt == "}" and ctx & contexts.TEMPLATE)
+            or (this == nxt == after == "}" and ctx & contexts.ARGUMENT)
+        )

    def _really_parse_external_link(self, brackets):
        """Really parse an external link."""
@@ -681,9 +712,13 @@
            self._emit_first(tokens.TagAttrQuote(char=data.quoter))
            self._emit_all(self._pop())
        buf = data.padding_buffer
-        self._emit_first(tokens.TagAttrStart(
-            pad_first=buf["first"], pad_before_eq=buf["before_eq"],
-            pad_after_eq=buf["after_eq"]))
+        self._emit_first(
+            tokens.TagAttrStart(
+                pad_first=buf["first"],
+                pad_before_eq=buf["before_eq"],
+                pad_after_eq=buf["after_eq"],
+            )
+        )
        self._emit_all(self._pop())
        for key in data.padding_buffer:
            data.padding_buffer[key] = ""

@@ -691,7 +726,9 @@
    def _handle_tag_space(self, data, text):
        """Handle whitespace (*text*) inside of an HTML open tag."""
        ctx = data.context
-        end_of_value = ctx & data.CX_ATTR_VALUE and not ctx & (data.CX_QUOTED | data.CX_NOTE_QUOTE)
+        end_of_value = ctx & data.CX_ATTR_VALUE and not ctx & (
+            data.CX_QUOTED | data.CX_NOTE_QUOTE
+        )
        if end_of_value or (ctx & data.CX_QUOTED and ctx & data.CX_NOTE_SPACE):
            self._push_tag_buffer(data)
            data.context = data.CX_ATTR_READY
@@ -792,8 +829,10 @@
        """Handle the ending of a closing tag (``</foo>``)."""
        strip = lambda tok: tok.text.rstrip().lower()
        closing = self._pop()
-        if len(closing) != 1 or (not isinstance(closing[0], tokens.Text) or
-                                 strip(closing[0]) != strip(self._stack[1])):
+        if len(closing) != 1 or (
+            not isinstance(closing[0], tokens.Text)
+            or strip(closing[0]) != strip(self._stack[1])
+        ):
            self._fail_route()
        self._emit_all(closing)
        self._emit(tokens.TagCloseClose())

@@ -808,8 +847,9 @@
                self._fail_route()
            elif this == "<" and nxt == "/":
                self._head += 3
-                if self._read() != ">" or (strip(self._read(-1)) !=
-                                           strip(self._stack[1].text)):
+                if self._read() != ">" or (
+                    strip(self._read(-1)) != strip(self._stack[1].text)
+                ):
                    self._head -= 1
                    self._emit_text("</")
                    continue

@@ -862,8 +902,10 @@
        self._emit(tokens.TagOpenOpen())
        while True:
            this, nxt = self._read(), self._read(1)
-            can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or
-                        data.context & data.CX_NOTE_SPACE)
+            can_exit = (
+                not data.context & (data.CX_QUOTED | data.CX_NAME)
+                or data.context & data.CX_NOTE_SPACE
+            )
            if this is self.END:
                if self._context & contexts.TAG_ATTR:
                    if data.context & data.CX_QUOTED:
@@ -1079,16 +1121,25 @@
        else:
            self._emit_text("\n")

-    def _emit_table_tag(self, open_open_markup, tag, style, padding,
-                        close_open_markup, contents, open_close_markup):
+    def _emit_table_tag(
+        self,
+        open_open_markup,
+        tag,
+        style,
+        padding,
+        close_open_markup,
+        contents,
+        open_close_markup,
+    ):
        """Emit a table tag."""
        self._emit(tokens.TagOpenOpen(wiki_markup=open_open_markup))
        self._emit_text(tag)
        if style:
            self._emit_all(style)
        if close_open_markup:
-            self._emit(tokens.TagCloseOpen(wiki_markup=close_open_markup,
-                                           padding=padding))
+            self._emit(
+                tokens.TagCloseOpen(wiki_markup=close_open_markup, padding=padding)
+            )
        else:
            self._emit(tokens.TagCloseOpen(padding=padding))
        if contents:

@@ -1103,8 +1154,9 @@
        data.context = _TagOpenData.CX_ATTR_READY
        while True:
            this = self._read()
-            can_exit = (not data.context & data.CX_QUOTED or
-                        data.context & data.CX_NOTE_SPACE)
+            can_exit = (
+                not data.context & data.CX_QUOTED or data.context & data.CX_NOTE_SPACE
+            )
            if this == end_token and can_exit:
                if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
                    self._push_tag_buffer(data)
@@ -1187,30 +1239,34 @@
            self._head -= 1
            return

-        cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
-                           line_context | contexts.TABLE_CELL_STYLE)
+        cell = self._parse(
+            contexts.TABLE_OPEN
+            | contexts.TABLE_CELL_OPEN
+            | line_context
+            | contexts.TABLE_CELL_STYLE
+        )
        cell_context = self._context
        self._context = old_context
        reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
        if reset_for_style:
            self._head = reset
-            self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
-                       line_context)
+            self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context)
            padding = self._handle_table_style("|")
            style = self._pop()
            # Don't parse the style separator:
            self._head += 1
-            cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
-                               line_context)
+            cell = self._parse(
+                contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context
+            )
            cell_context = self._context
            self._context = old_context

        close_open_markup = "|" if reset_for_style else None
-        self._emit_table_tag(markup, tag, style, padding, close_open_markup,
-                             cell, "")
+        self._emit_table_tag(markup, tag, style, padding, close_open_markup, cell, "")
        # Keep header/cell line contexts:
-        self._context |= cell_context & (contexts.TABLE_TH_LINE |
-                                         contexts.TABLE_TD_LINE)
+        self._context |= cell_context & (
+            contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE
+        )
        # Offset displacement done by parse():
        self._head -= 1


@@ -1333,7 +1389,11 @@
            elif this == "|" and self._context & contexts.TEMPLATE:
                self._handle_template_param()
            elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
-                if not self._global & contexts.GL_HEADING and self._read(-1) in ("\n", self.START) and nxt == "=":
+                if (
+                    not self._global & contexts.GL_HEADING
+                    and self._read(-1) in ("\n", self.START)
+                    and nxt == "="
+                ):
                    self._parse_heading()
                else:
                    self._handle_template_param_value()

@@ -1362,7 +1422,11 @@
                self._parse_external_link(False)
            elif this == "]" and self._context & contexts.EXT_LINK_TITLE:
                return self._pop()
-            elif this == "=" and not self._global & contexts.GL_HEADING and not self._context & contexts.TEMPLATE:
+            elif (
+                this == "="
+                and not self._global & contexts.GL_HEADING
+                and not self._context & contexts.TEMPLATE
+            ):
                if self._read(-1) in ("\n", self.START):
                    self._parse_heading()
                else:

@@ -1397,7 +1461,8 @@
            elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"):
                self._handle_list()
            elif self._read(-1) in ("\n", self.START) and (
-                    this == nxt == self._read(2) == self._read(3) == "-"):
+                this == nxt == self._read(2) == self._read(3) == "-"
+            ):
                self._handle_hr()
            elif this in ("\n", ":") and self._context & contexts.DL_TERM:
                self._handle_dl_term()
@@ -1405,9 +1470,17 @@
                # Kill potential table contexts
                self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
            # Start of table parsing
-            elif this == "{" and nxt == "|" and (
-                    self._read(-1) in ("\n", self.START) or
-                    (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
+            elif (
+                this == "{"
+                and nxt == "|"
+                and (
+                    self._read(-1) in ("\n", self.START)
+                    or (
+                        self._read(-2) in ("\n", self.START)
+                        and self._read(-1).isspace()
+                    )
+                )
+            ):
                if self._can_recurse():
                    self._parse_table()
                else:

@@ -1431,8 +1504,9 @@
            elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS:
                self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
                self._emit_text(this)
-            elif (self._read(-1) in ("\n", self.START) or
-                  (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
+            elif self._read(-1) in ("\n", self.START) or (
+                self._read(-2) in ("\n", self.START) and self._read(-1).isspace()
+            ):
                if this == "|" and nxt == "}":
                    if self._context & contexts.TABLE_CELL_OPEN:
                        return self._handle_table_cell_end()

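None of the tokenizer hunks change behavior: the reflowed MARKERS list and the re-wrapped boolean chains evaluate exactly as before. For reference, a small sketch of the streams this class emits (token reprs approximated from the test suite):

    from mwparserfromhell.parser.tokenizer import Tokenizer

    print(Tokenizer().tokenize("==foo=="))
    # Roughly: [HeadingStart(level=2), Text(text='foo'), HeadingEnd()]
    print(Tokenizer().tokenize("{{foo|bar}}"))
    # Roughly: [TemplateOpen(), Text(text='foo'), TemplateParamSeparator(),
    #           Text(text='bar'), TemplateClose()]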

+ 30
- 28
src/mwparserfromhell/parser/tokens.py

@@ -28,6 +28,7 @@ the :class:`.Wikicode` tree by the :class:`.Builder`.

__all__ = ["Token"]


class Token(dict):
    """A token stores the semantic meaning of a unit of wikicode."""

@@ -61,43 +62,44 @@ def make(name):
    __all__.append(name)
    return type(name, (Token,), {})


Text = make("Text")

TemplateOpen = make("TemplateOpen")  # {{
TemplateParamSeparator = make("TemplateParamSeparator")  # |
TemplateParamEquals = make("TemplateParamEquals")  # =
TemplateClose = make("TemplateClose")  # }}

ArgumentOpen = make("ArgumentOpen")  # {{{
ArgumentSeparator = make("ArgumentSeparator")  # |
ArgumentClose = make("ArgumentClose")  # }}}

WikilinkOpen = make("WikilinkOpen")  # [[
WikilinkSeparator = make("WikilinkSeparator")  # |
WikilinkClose = make("WikilinkClose")  # ]]

ExternalLinkOpen = make("ExternalLinkOpen")  # [
ExternalLinkSeparator = make("ExternalLinkSeparator")  #
ExternalLinkClose = make("ExternalLinkClose")  # ]

HTMLEntityStart = make("HTMLEntityStart")  # &
HTMLEntityNumeric = make("HTMLEntityNumeric")  # #
HTMLEntityHex = make("HTMLEntityHex")  # x
HTMLEntityEnd = make("HTMLEntityEnd")  # ;

HeadingStart = make("HeadingStart")  # =...
HeadingEnd = make("HeadingEnd")  # =...

CommentStart = make("CommentStart")  # <!--
CommentEnd = make("CommentEnd")  # -->

TagOpenOpen = make("TagOpenOpen")  # <
TagAttrStart = make("TagAttrStart")
TagAttrEquals = make("TagAttrEquals")  # =
TagAttrQuote = make("TagAttrQuote")  # ", '
TagCloseOpen = make("TagCloseOpen")  # >
TagCloseSelfclose = make("TagCloseSelfclose")  # />
TagOpenClose = make("TagOpenClose")  # </
TagCloseClose = make("TagCloseClose")  # >

del make

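The comment re-alignment above is invisible once rendered; what matters is how these generated classes behave. A minimal sketch (Token subclasses dict, and make() builds one class per token type):

    from mwparserfromhell.parser import tokens

    tok = tokens.Text(text="spam")
    assert isinstance(tok, tokens.Token) and isinstance(tok, dict)
    assert tok.text == "spam"  # attribute access reads the dict entry
    tok.text = "eggs"          # attribute writes update it in place
    assert tok == tokens.Text(text="eggs")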
+ 4
- 4
src/mwparserfromhell/smart_list/list_proxy.py

@@ -167,7 +167,7 @@ class ListProxy(_SliceNormalizerMixIn, list):

    def _render(self):
        """Return the actual list from the stored start/stop/step."""
-        return list(self._parent)[self._start:self._stop:self._step]
+        return list(self._parent)[self._start : self._stop : self._step]

    @inheritdoc
    def append(self, item):

@@ -187,7 +187,7 @@ class ListProxy(_SliceNormalizerMixIn, list):

    @inheritdoc
    def extend(self, item):
-        self._parent[self._stop:self._stop] = item
+        self._parent[self._stop : self._stop] = item

    @inheritdoc
    def insert(self, index, item):

@@ -215,7 +215,7 @@ class ListProxy(_SliceNormalizerMixIn, list):
    def reverse(self):
        item = self._render()
        item.reverse()
-        self._parent[self._start:self._stop:self._step] = item
+        self._parent[self._start : self._stop : self._step] = item

    @inheritdoc
    def sort(self, key=None, reverse=None):

@@ -226,4 +226,4 @@ class ListProxy(_SliceNormalizerMixIn, list):
        if reverse is not None:
            kwargs["reverse"] = reverse
        item.sort(**kwargs)
-        self._parent[self._start:self._stop:self._step] = item
+        self._parent[self._start : self._stop : self._step] = item

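The hunks above only add black's spacing around complex slice expressions; the slices themselves are untouched. They are worth a sketch, though, since ListProxy is the live view that makes this file tricky: slicing a SmartList yields a proxy, not a copy.

    from mwparserfromhell.smart_list import SmartList

    parent = SmartList([0, 1, 2, 3, 4])
    child = parent[1:4]       # a ListProxy over parent, not a copy
    parent[2] = 99
    assert child == [1, 99, 3]            # the view tracks its parent...
    child.append(5)
    assert parent == [0, 1, 99, 3, 5, 4]  # ...and writes back through it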
+ 5
- 2
src/mwparserfromhell/string_mixin.py

@@ -27,6 +27,7 @@ from sys import getdefaultencoding

__all__ = ["StringMixIn"]


def inheritdoc(method):
    """Set __doc__ of *method* to __doc__ of *method* in its parent class.

@@ -36,6 +37,7 @@ def inheritdoc(method):
    method.__doc__ = getattr(str, method.__name__).__doc__
    return method


class StringMixIn:
    """Implement the interface for ``str`` in a dynamic manner.

@@ -92,8 +94,9 @@ class StringMixIn:

    def __getattr__(self, attr):
        if not hasattr(str, attr):
-            raise AttributeError("{!r} object has no attribute {!r}".format(
-                type(self).__name__, attr))
+            raise AttributeError(
+                "{!r} object has no attribute {!r}".format(type(self).__name__, attr)
+            )
        return getattr(self.__str__(), attr)

    maketrans = str.maketrans  # Static method can't rely on __getattr__

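The reflowed AttributeError comes from __getattr__, which is the whole trick of this mix-in: any str method not defined on the subclass is looked up on str(self). A minimal sketch with a hypothetical subclass (Shouted is illustrative, not part of the library):

    from mwparserfromhell.string_mixin import StringMixIn

    class Shouted(StringMixIn):  # hypothetical example subclass
        def __init__(self, text):
            self._text = text

        def __str__(self):
            return self._text.upper()

    s = Shouted("wiki")
    assert s.startswith("WIKI")  # delegated to str via __getattr__
    # s.bogus_method would raise the AttributeError formatted above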

+ 5
- 2
src/mwparserfromhell/utils.py

@@ -25,6 +25,7 @@ users generally won't need stuff from here.

__all__ = ["parse_anything"]


def parse_anything(value, context=0, skip_style_tags=False):
    """Return a :class:`.Wikicode` for *value*, allowing multiple types.

@@ -64,6 +65,8 @@ def parse_anything(value, context=0, skip_style_tags=False):
            nodelist += parse_anything(item, context, skip_style_tags).nodes
        return Wikicode(nodelist)
    except TypeError as exc:
-        error = ("Needs string, Node, Wikicode, file, int, None, or "
-                 "iterable of these, but got {0}: {1}")
+        error = (
+            "Needs string, Node, Wikicode, file, int, None, or "
+            "iterable of these, but got {0}: {1}"
+        )
        raise ValueError(error.format(type(value).__name__, value)) from exc

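The re-wrapped error string is what users see when parse_anything() is handed something unusable; everything it lists really is accepted. A minimal sketch (rendered output approximate):

    from mwparserfromhell.utils import parse_anything

    code = parse_anything(["foo ", 42, None, "{{bar}}"])
    print(code)  # roughly "foo 42{{bar}}"
    # parse_anything(object())  # would raise ValueError with the message above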
+ 57
- 22
src/mwparserfromhell/wikicode.py

@@ -21,8 +21,18 @@
import re
from itertools import chain

-from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity,
-                    Node, Tag, Template, Text, Wikilink)
+from .nodes import (
+    Argument,
+    Comment,
+    ExternalLink,
+    Heading,
+    HTMLEntity,
+    Node,
+    Tag,
+    Template,
+    Text,
+    Wikilink,
+)
from .smart_list.list_proxy import ListProxy
from .string_mixin import StringMixIn
from .utils import parse_anything

@@ -31,6 +41,7 @@ __all__ = ["Wikicode"]

FLAGS = re.IGNORECASE | re.DOTALL | re.UNICODE


class Wikicode(StringMixIn):
    """A ``Wikicode`` is a container for nodes that operates like a string.

@@ -41,6 +52,7 @@ class Wikicode(StringMixIn):
    <ifilter>` series of functions is very useful for extracting and iterating
    over, for example, all of the templates in the object.
    """

    RECURSE_OTHERS = 2

    def __init__(self, nodes):

@@ -82,8 +94,9 @@ class Wikicode(StringMixIn):
            return lambda obj: re.search(matches, str(obj), flags)
        return lambda obj: True

-    def _indexed_ifilter(self, recursive=True, matches=None, flags=FLAGS,
-                         forcetype=None):
+    def _indexed_ifilter(
+        self, recursive=True, matches=None, flags=FLAGS, forcetype=None
+    ):
        """Iterate over nodes and their corresponding indices in the node list.

        The arguments are interpreted as for :meth:`ifilter`. For each tuple

@@ -94,9 +107,11 @@ class Wikicode(StringMixIn):
        match = self._build_matcher(matches, flags)
        if recursive:
            restrict = forcetype if recursive == self.RECURSE_OTHERS else None

            def getter(i, node):
                for ch in self._get_children(node, restrict=restrict):
                    yield (i, ch)

            inodes = chain(*(getter(i, n) for i, n in enumerate(self.nodes)))
        else:
            inodes = enumerate(self.nodes)

@@ -106,6 +121,7 @@ class Wikicode(StringMixIn):

    def _is_child_wikicode(self, obj, recursive=True):
        """Return whether the given :class:`.Wikicode` is a descendant."""

        def deref(nodes):
            if isinstance(nodes, ListProxy):
                return nodes._parent  # pylint: disable=protected-access

@@ -210,6 +226,7 @@
        should be any object that can be tested for with ``is``. *indent* is
        the starting indentation.
        """

        def write(*args):
            """Write a new line following the proper indentation rules."""
            if lines and lines[-1] is marker:  # Continue from the last line
@@ -243,10 +260,12 @@
        This is equivalent to :meth:`{1}` with *forcetype* set to
        :class:`~{2.__module__}.{2.__name__}`.
        """
-        make_ifilter = lambda ftype: (lambda self, *a, **kw:
-                                      self.ifilter(forcetype=ftype, *a, **kw))
-        make_filter = lambda ftype: (lambda self, *a, **kw:
-                                     self.filter(forcetype=ftype, *a, **kw))
+        make_ifilter = lambda ftype: (
+            lambda self, *a, **kw: self.ifilter(forcetype=ftype, *a, **kw)
+        )
+        make_filter = lambda ftype: (
+            lambda self, *a, **kw: self.filter(forcetype=ftype, *a, **kw)
+        )
        for name, ftype in meths.items():
            ifilt = make_ifilter(ftype)
            filt = make_filter(ftype)

@@ -342,6 +361,7 @@
        Will return an empty list if *obj* is at the top level of this Wikicode
        object. Will raise :exc:`ValueError` if it wasn't found.
        """

        def _get_ancestors(code, needle):
            for node in code.nodes:
                if node is needle:

@@ -510,8 +530,7 @@
            return True
        return False

-    def ifilter(self, recursive=True, matches=None, flags=FLAGS,
-                forcetype=None):
+    def ifilter(self, recursive=True, matches=None, flags=FLAGS, forcetype=None):
        """Iterate over nodes in our list matching certain conditions.

        If *forcetype* is given, only nodes that are instances of this type (or

@@ -545,8 +564,15 @@
        """
        return list(self.ifilter(*args, **kwargs))

-    def get_sections(self, levels=None, matches=None, flags=FLAGS, flat=False,
-                     include_lead=None, include_headings=True):
+    def get_sections(
+        self,
+        levels=None,
+        matches=None,
+        flags=FLAGS,
+        flat=False,
+        include_lead=None,
+        include_headings=True,
+    ):
        """Return a list of sections within the page.

        Sections are returned as :class:`.Wikicode` objects with a shared node

@@ -568,12 +594,14 @@
        :class:`.Heading` object will be included; otherwise, this is skipped.
        """
        title_matcher = self._build_matcher(matches, flags)
-        matcher = lambda heading: (title_matcher(heading.title) and
-                                   (not levels or heading.level in levels))
+        matcher = lambda heading: (
+            title_matcher(heading.title) and (not levels or heading.level in levels)
+        )
        iheadings = self._indexed_ifilter(recursive=False, forcetype=Heading)
        sections = []  # Tuples of (index_of_first_node, section)
-        open_headings = []  # Tuples of (index, heading), where index and
-                            # heading.level are both monotonically increasing
+        # Tuples of (index, heading), where index and heading.level are both
+        # monotonically increasing
+        open_headings = []

        # Add the lead section if appropriate:
        if include_lead or not (include_lead is not None or matches or levels):
@@ -610,8 +638,7 @@
        # Ensure that earlier sections are earlier in the returned list:
        return [section for i, section in sorted(sections)]

-    def strip_code(self, normalize=True, collapse=True,
-                   keep_template_params=False):
+    def strip_code(self, normalize=True, collapse=True, keep_template_params=False):
        """Return a rendered string without unprintable code such as templates.

        The way a node is stripped is handled by the

@@ -631,7 +658,7 @@
        kwargs = {
            "normalize": normalize,
            "collapse": collapse,
-            "keep_template_params": keep_template_params
+            "keep_template_params": keep_template_params,
        }

        nodes = []

@@ -673,7 +700,15 @@
        marker = object()  # Random object we can find with certainty in a list
        return "\n".join(self._get_tree(self, [], marker, 0))


Wikicode._build_filter_methods(
-    arguments=Argument, comments=Comment, external_links=ExternalLink,
-    headings=Heading, html_entities=HTMLEntity, tags=Tag, templates=Template,
-    text=Text, wikilinks=Wikilink)
+    arguments=Argument,
+    comments=Comment,
+    external_links=ExternalLink,
+    headings=Heading,
+    html_entities=HTMLEntity,
+    tags=Tag,
+    templates=Template,
+    text=Text,
+    wikilinks=Wikilink,
+)

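_build_filter_methods is what turns the keyword arguments at the bottom of this file into the filter_templates()/ifilter_templates() family, one pair per node type, and get_sections() drives the section logic reflowed above. A minimal sketch of that generated API:

    import mwparserfromhell

    code = mwparserfromhell.parse("== Intro ==\nHello {{world}} and [[page]]!")
    print(code.filter_templates())  # ['{{world}}']
    print(code.filter_wikilinks())  # ['[[page]]']
    intro = code.get_sections(levels=[2])[0]
    print(intro.filter_headings())  # ['== Intro ==']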
+ 22
- 2
tests/conftest.py

@@ -18,14 +18,24 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

-from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading,
-                                    HTMLEntity, Tag, Template, Text, Wikilink)
+from mwparserfromhell.nodes import (
+    Argument,
+    Comment,
+    ExternalLink,
+    Heading,
+    HTMLEntity,
+    Tag,
+    Template,
+    Text,
+    Wikilink,
+)
from mwparserfromhell.smart_list import SmartList
from mwparserfromhell.wikicode import Wikicode

wrap = lambda L: Wikicode(SmartList(L))
wraptext = lambda *args: wrap([Text(t) for t in args])


def _assert_node_equal(expected, actual):
    """Assert that two Nodes have the same type and have the same data."""
    registry = {

@@ -43,6 +53,7 @@ def _assert_node_equal(expected, actual):
    assert type(expected) == type(actual)
    registry[type(expected)](expected, actual)


def _assert_argument_node_equal(expected, actual):
    """Assert that two Argument nodes have the same data."""
    assert_wikicode_equal(expected.name, actual.name)

@@ -51,10 +62,12 @@ def _assert_argument_node_equal(expected, actual):
    else:
        assert actual.default is None


def _assert_comment_node_equal(expected, actual):
    """Assert that two Comment nodes have the same data."""
    assert expected.contents == actual.contents


def _assert_external_link_node_equal(expected, actual):
    """Assert that two ExternalLink nodes have the same data."""
    assert_wikicode_equal(expected.url, actual.url)

@@ -65,11 +78,13 @@ def _assert_external_link_node_equal(expected, actual):
    assert expected.brackets is actual.brackets
    assert expected.suppress_space is actual.suppress_space


def _assert_heading_node_equal(expected, actual):
    """Assert that two Heading nodes have the same data."""
    assert_wikicode_equal(expected.title, actual.title)
    assert expected.level == actual.level


def _assert_html_entity_node_equal(expected, actual):
    """Assert that two HTMLEntity nodes have the same data."""
    assert expected.value == actual.value

@@ -77,6 +92,7 @@ def _assert_html_entity_node_equal(expected, actual):
    assert expected.hexadecimal is actual.hexadecimal
    assert expected.hex_char == actual.hex_char


def _assert_tag_node_equal(expected, actual):
    """Assert that two Tag nodes have the same data."""
    assert_wikicode_equal(expected.tag, actual.tag)

@@ -105,6 +121,7 @@ def _assert_tag_node_equal(expected, actual):
    assert expected.padding == actual.padding
    assert_wikicode_equal(expected.closing_tag, actual.closing_tag)


def _assert_template_node_equal(expected, actual):
    """Assert that two Template nodes have the same data."""
    assert_wikicode_equal(expected.name, actual.name)

@@ -117,10 +134,12 @@ def _assert_template_node_equal(expected, actual):
        assert_wikicode_equal(exp_param.value, act_param.value)
        assert exp_param.showkey is act_param.showkey


def _assert_text_node_equal(expected, actual):
    """Assert that two Text nodes have the same data."""
    assert expected.value == actual.value


def _assert_wikilink_node_equal(expected, actual):
    """Assert that two Wikilink nodes have the same data."""
    assert_wikicode_equal(expected.title, actual.title)

@@ -129,6 +148,7 @@ def _assert_wikilink_node_equal(expected, actual):
    else:
        assert actual.text is None


def assert_wikicode_equal(expected, actual):
    """Assert that two Wikicode objects have the same data."""
    assert isinstance(actual, Wikicode)

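wrap and wraptext are the helpers the rewritten tests below lean on everywhere: they build a real Wikicode tree around hand-made nodes so assert_wikicode_equal can compare structure rather than strings. A minimal sketch:

    from mwparserfromhell.nodes import Text
    from mwparserfromhell.smart_list import SmartList
    from mwparserfromhell.wikicode import Wikicode

    wrap = lambda L: Wikicode(SmartList(L))
    wraptext = lambda *args: wrap([Text(t) for t in args])

    expected = wraptext("foo", "bar")  # Wikicode([Text('foo'), Text('bar')])
    assert str(expected) == "foobar"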

+ 16
- 2
tests/test_argument.py

@@ -27,6 +27,7 @@ import pytest
from mwparserfromhell.nodes import Argument, Text
from .conftest import assert_wikicode_equal, wrap, wraptext


def test_str():
    """test Argument.__str__()"""
    node = Argument(wraptext("foobar"))

@@ -34,6 +35,7 @@ def test_str():
    node2 = Argument(wraptext("foo"), wraptext("bar"))
    assert "{{{foo|bar}}}" == str(node2)


def test_children():
    """test Argument.__children__()"""
    node1 = Argument(wraptext("foobar"))

@@ -48,6 +50,7 @@ def test_children():
    with pytest.raises(StopIteration):
        next(gen2)


def test_strip():
    """test Argument.__strip__()"""
    node1 = Argument(wraptext("foobar"))

@@ -55,6 +58,7 @@ def test_strip():
    assert node1.__strip__() is None
    assert "bar" == node2.__strip__()


def test_showtree():
    """test Argument.__showtree__()"""
    output = []

@@ -66,10 +70,19 @@ def test_showtree():
    node1.__showtree__(output.append, get, mark)
    node2.__showtree__(output.append, get, mark)
    valid = [
-        "{{{", (getter, node1.name), "}}}", "{{{", (getter, node2.name),
-        " | ", marker, (getter, node2.default), "}}}"]
+        "{{{",
+        (getter, node1.name),
+        "}}}",
+        "{{{",
+        (getter, node2.name),
+        " | ",
+        marker,
+        (getter, node2.default),
+        "}}}",
+    ]
    assert valid == output


def test_name():
    """test getter/setter for the name attribute"""
    name = wraptext("foobar")

@@ -82,6 +95,7 @@ def test_name():
    assert_wikicode_equal(wraptext("héhehé"), node1.name)
    assert_wikicode_equal(wraptext("héhehé"), node2.name)


def test_default():
    """test getter/setter for the default attribute"""
    default = wraptext("baz")


+ 5
- 0
tests/test_attribute.py

@@ -28,6 +28,7 @@ from mwparserfromhell.nodes import Template
from mwparserfromhell.nodes.extras import Attribute
from .conftest import assert_wikicode_equal, wrap, wraptext


def test_str():
    """test Attribute.__str__()"""
    node = Attribute(wraptext("foo"))

@@ -43,6 +44,7 @@ def test_str():
    node6 = Attribute(wraptext("a"), wrap([]), None, " ", "", " ")
    assert " a= " == str(node6)


def test_name():
    """test getter/setter for the name attribute"""
    name = wraptext("id")

@@ -51,6 +53,7 @@ def test_name():
    node.name = "{{id}}"
    assert_wikicode_equal(wrap([Template(wraptext("id"))]), node.name)


def test_value():
    """test getter/setter for the value attribute"""
    value = wraptext("foo")

@@ -74,6 +77,7 @@ def test_value():
    assert_wikicode_equal(wraptext("fo\"o 'bar' b\"az"), node2.value)
    assert '"' == node2.quotes


def test_quotes():
    """test getter/setter for the quotes attribute"""
    node1 = Attribute(wraptext("id"), wraptext("foo"), None)

@@ -92,6 +96,7 @@ def test_quotes():
    with pytest.raises(ValueError):
        Attribute(wraptext("id"), wraptext("foo bar baz"), None)


def test_padding():
    """test getter/setter for the padding attributes"""
    for pad in ["pad_first", "pad_before_eq", "pad_after_eq"]:


+ 737
- 326
tests/test_builder.py
File diff suppressed because it is too large


+ 5
- 0
tests/test_comment.py

@@ -26,11 +26,13 @@ import pytest

from mwparserfromhell.nodes import Comment


def test_str():
    """test Comment.__str__()"""
    node = Comment("foobar")
    assert "<!--foobar-->" == str(node)


def test_children():
    """test Comment.__children__()"""
    node = Comment("foobar")

@@ -38,11 +40,13 @@ def test_children():
    with pytest.raises(StopIteration):
        next(gen)


def test_strip():
    """test Comment.__strip__()"""
    node = Comment("foobar")
    assert node.__strip__() is None


def test_showtree():
    """test Comment.__showtree__()"""
    output = []

@@ -50,6 +54,7 @@ def test_showtree():
    node.__showtree__(output.append, None, None)
    assert ["<!--foobar-->"] == output


def test_contents():
    """test getter/setter for the contents attribute"""
    node = Comment("foobar")


+ 12
- 6
tests/test_docs.py

@@ -32,6 +32,7 @@ import pytest

import mwparserfromhell


def assert_print(value, output):
    """Assertion check that *value*, when printed, produces *output*."""
    buff = StringIO()

@@ -39,6 +40,7 @@ def assert_print(value, output):
    buff.seek(0)
    assert output == buff.read()


def test_readme_1():
    """test a block of example code in the README"""
    text = "I has a template! {{foo|bar|baz|eggs=spam}} See it?"

@@ -52,6 +54,7 @@ def test_readme_1():
    assert_print(template.get(1).value, "bar")
    assert_print(template.get("eggs").value, "spam")


def test_readme_2():
    """test a block of example code in the README"""
    text = "{{foo|{{bar}}={{baz|{{spam}}}}}}"

@@ -59,17 +62,19 @@ def test_readme_2():
    res = "['{{foo|{{bar}}={{baz|{{spam}}}}}}', '{{bar}}', '{{baz|{{spam}}}}', '{{spam}}']"
    assert_print(temps, res)


def test_readme_3():
    """test a block of example code in the README"""
    code = mwparserfromhell.parse("{{foo|this {{includes a|template}}}}")
-    assert_print(code.filter_templates(recursive=False),
-                 "['{{foo|this {{includes a|template}}}}']")
+    assert_print(
+        code.filter_templates(recursive=False),
+        "['{{foo|this {{includes a|template}}}}']",
+    )
    foo = code.filter_templates(recursive=False)[0]
    assert_print(foo.get(1).value, "this {{includes a|template}}")
-    assert_print(foo.get(1).value.filter_templates()[0],
-                 "{{includes a|template}}")
-    assert_print(foo.get(1).value.filter_templates()[0].get(1).value,
-                 "template")
+    assert_print(foo.get(1).value.filter_templates()[0], "{{includes a|template}}")
+    assert_print(foo.get(1).value.filter_templates()[0].get(1).value, "template")


def test_readme_4():
    """test a block of example code in the README"""

@@ -90,6 +95,7 @@ def test_readme_4():
    assert_print(text, res)
    assert text == code


@pytest.mark.skipif("NOWEB" in os.environ, reason="web test disabled by environ var")
def test_readme_5():
    """test a block of example code in the README; includes a web call"""


+ 12
- 7
tests/test_external_link.py

@@ -27,6 +27,7 @@ import pytest
from mwparserfromhell.nodes import ExternalLink, Text
from .conftest import assert_wikicode_equal, wrap, wraptext


def test_str():
    """test ExternalLink.__str__()"""
    node = ExternalLink(wraptext("http://example.com/"), brackets=False)

@@ -35,15 +36,16 @@ def test_str():
    assert "[http://example.com/]" == str(node2)
    node3 = ExternalLink(wraptext("http://example.com/"), wrap([]))
    assert "[http://example.com/ ]" == str(node3)
-    node4 = ExternalLink(wraptext("http://example.com/"),
-                         wraptext("Example Web Page"))
+    node4 = ExternalLink(wraptext("http://example.com/"), wraptext("Example Web Page"))
    assert "[http://example.com/ Example Web Page]" == str(node4)


def test_children():
    """test ExternalLink.__children__()"""
    node1 = ExternalLink(wraptext("http://example.com/"), brackets=False)
-    node2 = ExternalLink(wraptext("http://example.com/"),
-                         wrap([Text("Example"), Text("Page")]))
+    node2 = ExternalLink(
+        wraptext("http://example.com/"), wrap([Text("Example"), Text("Page")])
+    )
    gen1 = node1.__children__()
    gen2 = node2.__children__()
    assert node1.url == next(gen1)

@@ -54,6 +56,7 @@ def test_children():
    with pytest.raises(StopIteration):
        next(gen2)


def test_strip():
    """test ExternalLink.__strip__()"""
    node1 = ExternalLink(wraptext("http://example.com"), brackets=False)

@@ -66,6 +69,7 @@ def test_strip():
    assert node3.__strip__() is None
    assert "Link" == node4.__strip__()


def test_showtree():
    """test ExternalLink.__showtree__()"""
    output = []

@@ -76,11 +80,10 @@ def test_showtree():
    node2 = ExternalLink(wraptext("http://example.com"), wraptext("Link"))
    node1.__showtree__(output.append, get, mark)
    node2.__showtree__(output.append, get, mark)
-    valid = [
-        (getter, node1.url), "[", (getter, node2.url),
-        (getter, node2.title), "]"]
+    valid = [(getter, node1.url), "[", (getter, node2.url), (getter, node2.title), "]"]
    assert valid == output


def test_url():
    """test getter/setter for the url attribute"""
    url = wraptext("http://example.com/")

@@ -93,6 +96,7 @@ def test_url():
    assert_wikicode_equal(wraptext("mailto:héhehé@spam.com"), node1.url)
    assert_wikicode_equal(wraptext("mailto:héhehé@spam.com"), node2.url)


def test_title():
    """test getter/setter for the title attribute"""
    title = wraptext("Example!")

@@ -105,6 +109,7 @@ def test_title():
    node2.title = "My Website"
    assert_wikicode_equal(wraptext("My Website"), node2.title)


def test_brackets():
    """test getter/setter for the brackets attribute"""
    node1 = ExternalLink(wraptext("http://example.com/"), brackets=False)


+ 7
- 2
tests/test_heading.py

@@ -27,6 +27,7 @@ import pytest
from mwparserfromhell.nodes import Heading, Text
from .conftest import assert_wikicode_equal, wrap, wraptext


def test_str():
    """test Heading.__str__()"""
    node = Heading(wraptext("foobar"), 2)

@@ -34,6 +35,7 @@ def test_str():
    node2 = Heading(wraptext(" zzz "), 5)
    assert "===== zzz =====" == str(node2)


def test_children():
    """test Heading.__children__()"""
    node = Heading(wrap([Text("foo"), Text("bar")]), 3)

@@ -42,11 +44,13 @@ def test_children():
    with pytest.raises(StopIteration):
        next(gen)


def test_strip():
    """test Heading.__strip__()"""
    node = Heading(wraptext("foobar"), 3)
    assert "foobar" == node.__strip__()


def test_showtree():
    """test Heading.__showtree__()"""
    output = []

@@ -56,10 +60,10 @@ def test_showtree():
    node2 = Heading(wraptext(" baz "), 4)
    node1.__showtree__(output.append, get, None)
    node2.__showtree__(output.append, get, None)
-    valid = ["===", (getter, node1.title), "===",
-             "====", (getter, node2.title), "===="]
+    valid = ["===", (getter, node1.title), "===", "====", (getter, node2.title), "===="]
    assert valid == output


def test_title():
    """test getter/setter for the title attribute"""
    title = wraptext("foobar")

@@ -68,6 +72,7 @@ def test_title():
    node.title = "héhehé"
    assert_wikicode_equal(wraptext("héhehé"), node.title)


def test_level():
    """test getter/setter for the level attribute"""
    node = Heading(wraptext("foobar"), 3)


+ 9
- 0
tests/test_html_entity.py View File

@@ -26,6 +26,7 @@ import pytest


from mwparserfromhell.nodes import HTMLEntity


def test_str():
    """test HTMLEntity.__str__()"""
    node1 = HTMLEntity("nbsp", named=True, hexadecimal=False)
@@ -37,6 +38,7 @@ def test_str():
    assert "&#x6b;" == str(node3)
    assert "&#X6C;" == str(node4)


def test_children():
    """test HTMLEntity.__children__()"""
    node = HTMLEntity("nbsp", named=True, hexadecimal=False)
@@ -44,6 +46,7 @@ def test_children():
    with pytest.raises(StopIteration):
        next(gen)


def test_strip():
    """test HTMLEntity.__strip__()"""
    node1 = HTMLEntity("nbsp", named=True, hexadecimal=False)
@@ -57,6 +60,7 @@ def test_strip():
    assert "é" == node3.__strip__(normalize=True)
    assert "&#xe9;" == node3.__strip__(normalize=False)


def test_showtree():
    """test HTMLEntity.__showtree__()"""
    output = []
@@ -69,6 +73,7 @@ def test_showtree():
    res = ["&nbsp;", "&#107;", "&#xe9;"]
    assert res == output


def test_value():
    """test getter/setter for the value attribute"""
    node1 = HTMLEntity("nbsp")
@@ -109,6 +114,7 @@ def test_value():
    with pytest.raises(ValueError):
        node1.__setattr__("value", "12FFFF")


def test_named():
    """test getter/setter for the named attribute"""
    node1 = HTMLEntity("nbsp")
@@ -130,6 +136,7 @@ def test_named():
    with pytest.raises(ValueError):
        node3.__setattr__("named", True)


def test_hexadecimal():
    """test getter/setter for the hexadecimal attribute"""
    node1 = HTMLEntity("nbsp")
@@ -147,6 +154,7 @@ def test_hexadecimal():
    with pytest.raises(ValueError):
        node1.__setattr__("hexadecimal", True)


def test_hex_char():
    """test getter/setter for the hex_char attribute"""
    node1 = HTMLEntity("e9")
@@ -164,6 +172,7 @@ def test_hex_char():
    with pytest.raises(ValueError):
        node1.__setattr__("hex_char", True)


def test_normalize():
    """test getter/setter for the normalize attribute"""
    node1 = HTMLEntity("nbsp")
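
The three constructor forms exercised here cover every way an entity can be written in wikitext. A short usage sketch built only from values that appear in the assertions above:

    from mwparserfromhell.nodes import HTMLEntity

    named = HTMLEntity("nbsp", named=True, hexadecimal=False)
    decimal = HTMLEntity("107", named=False, hexadecimal=False)
    hexa = HTMLEntity("e9", named=False, hexadecimal=True)

    assert str(named) == "&nbsp;"    # named entities render with their name
    assert str(decimal) == "&#107;"  # decimal code points use &#...;
    assert str(hexa) == "&#xe9;"     # hex code points use &#x...;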


+4 -0  tests/test_parameter.py

@@ -27,6 +27,7 @@ import pytest
from mwparserfromhell.nodes.extras import Parameter
from .conftest import assert_wikicode_equal, wraptext


def test_str():
    """test Parameter.__str__()"""
    node = Parameter(wraptext("1"), wraptext("foo"), showkey=False)
@@ -34,6 +35,7 @@ def test_str():
    node2 = Parameter(wraptext("foo"), wraptext("bar"))
    assert "foo=bar" == str(node2)


def test_name():
    """test getter/setter for the name attribute"""
    name1 = wraptext("1")
@@ -47,6 +49,7 @@ def test_name():
    assert_wikicode_equal(wraptext("héhehé"), node1.name)
    assert_wikicode_equal(wraptext("héhehé"), node2.name)


def test_value():
    """test getter/setter for the value attribute"""
    value = wraptext("bar")
@@ -55,6 +58,7 @@ def test_value():
    node.value = "héhehé"
    assert_wikicode_equal(wraptext("héhehé"), node.value)


def test_showkey():
    """test getter/setter for the showkey attribute"""
    node1 = Parameter(wraptext("1"), wraptext("foo"), showkey=False)


+43 -19  tests/test_parser.py

@@ -29,6 +29,7 @@ from mwparserfromhell.nodes import Tag, Template, Text, Wikilink
from mwparserfromhell.nodes.extras import Parameter
from .conftest import assert_wikicode_equal, wrap, wraptext


@pytest.fixture()
def pyparser():
    """make sure the correct tokenizer is used"""
@@ -38,37 +39,60 @@ def pyparser():
    yield
    parser.use_c = restore


def test_use_c(pyparser):
    assert parser.Parser()._tokenizer.USES_C is False


def test_parsing(pyparser):
    """integration test for parsing overall"""
    text = "this is text; {{this|is=a|template={{with|[[links]]|in}}it}}"
-    expected = wrap([
-        Text("this is text; "),
-        Template(wraptext("this"), [
-            Parameter(wraptext("is"), wraptext("a")),
-            Parameter(wraptext("template"), wrap([
-                Template(wraptext("with"), [
-                    Parameter(wraptext("1"),
-                              wrap([Wikilink(wraptext("links"))]),
-                              showkey=False),
-                    Parameter(wraptext("2"),
-                              wraptext("in"), showkey=False)
-                ]),
-                Text("it")
-            ]))
-        ])
-    ])
+    expected = wrap(
+        [
+            Text("this is text; "),
+            Template(
+                wraptext("this"),
+                [
+                    Parameter(wraptext("is"), wraptext("a")),
+                    Parameter(
+                        wraptext("template"),
+                        wrap(
+                            [
+                                Template(
+                                    wraptext("with"),
+                                    [
+                                        Parameter(
+                                            wraptext("1"),
+                                            wrap([Wikilink(wraptext("links"))]),
+                                            showkey=False,
+                                        ),
+                                        Parameter(
+                                            wraptext("2"), wraptext("in"), showkey=False
+                                        ),
+                                    ],
+                                ),
+                                Text("it"),
+                            ]
+                        ),
+                    ),
+                ],
+            ),
+        ]
+    )
    actual = parser.Parser().parse(text)
    assert_wikicode_equal(expected, actual)


def test_skip_style_tags(pyparser):
    """test Parser.parse(skip_style_tags=True)"""
    text = "This is an example with ''italics''!"
-    a = wrap([Text("This is an example with "),
-              Tag(wraptext("i"), wraptext("italics"), wiki_markup="''"),
-              Text("!")])
+    a = wrap(
+        [
+            Text("This is an example with "),
+            Tag(wraptext("i"), wraptext("italics"), wiki_markup="''"),
+            Text("!"),
+        ]
+    )
    b = wraptext("This is an example with ''italics''!")

    with_style = parser.Parser().parse(text, skip_style_tags=False)
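
The hand-built `expected` tree above has a much shorter public-API counterpart. A hedged sketch using the same parse text (`filter_templates` recurses by default, so the nested template is returned too):

    import mwparserfromhell

    code = mwparserfromhell.parse(
        "this is text; {{this|is=a|template={{with|[[links]]|in}}it}}"
    )

    # Outer template first, then the nested {{with|...}}.
    outer, inner = code.filter_templates()
    assert str(outer.name) == "this"
    assert str(inner.name) == "with"
    assert inner.params[0].showkey is False  # "[[links]]" is positional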


+27 -2  tests/test_smart_list.py

@@ -27,6 +27,7 @@ import pytest
from mwparserfromhell.smart_list import SmartList
from mwparserfromhell.smart_list.list_proxy import ListProxy


def _test_get_set_del_item(builder):
    """Run tests on __get/set/delitem__ of a list built with *builder*."""
    list1 = builder([0, 1, 2, 3, "one", "two"])
@@ -104,6 +105,7 @@ def _test_get_set_del_item(builder):
    del list2[2:8:2]
    assert [0, 1, 3, 5, 7, 8, 9] == list2


def _test_add_radd_iadd(builder):
    """Run tests on __r/i/add__ of a list built with *builder*."""
    list1 = builder(range(5))
@@ -116,6 +118,7 @@ def _test_add_radd_iadd(builder):
    list1 += ["foo", "bar", "baz"]
    assert [0, 1, 2, 3, 4, "foo", "bar", "baz"] == list1


def _test_other_magic_methods(builder):
    """Run tests on other magic methods of a list built with *builder*."""
    list1 = builder([0, 1, 2, 3, "one", "two"])
@@ -200,6 +203,7 @@ def _test_other_magic_methods(builder):
    list4 *= 2
    assert [0, 1, 2, 0, 1, 2] == list4


def _test_list_methods(builder):
    """Run tests on the public methods of a list built with *builder*."""
    list1 = builder(range(5))
@@ -263,6 +267,7 @@ def _test_list_methods(builder):
    list3.sort(key=lambda i: i[1], reverse=True)
    assert [("b", 8), ("a", 5), ("c", 3), ("d", 2)] == list3


def _dispatch_test_for_children(meth):
    """Run a test method on various different types of children."""
    meth(lambda L: SmartList(list(L))[:])
@@ -270,10 +275,20 @@ def _dispatch_test_for_children(meth):
    meth(lambda L: SmartList(list(L) + [999])[:-1])
    meth(lambda L: SmartList([101, 102] + list(L) + [201, 202])[2:-2])


def test_docs():
    """make sure the methods of SmartList/ListProxy have docstrings"""
-    methods = ["append", "count", "extend", "index", "insert", "pop",
-               "remove", "reverse", "sort"]
+    methods = [
+        "append",
+        "count",
+        "extend",
+        "index",
+        "insert",
+        "pop",
+        "remove",
+        "reverse",
+        "sort",
+    ]
    for meth in methods:
        expected = getattr(list, meth).__doc__
        smartlist_doc = getattr(SmartList, meth).__doc__
@@ -281,6 +296,7 @@ def test_docs():
        assert expected == smartlist_doc
        assert expected == listproxy_doc


def test_doctest():
    """make sure the test embedded in SmartList's docstring passes"""
    parent = SmartList([0, 1, 2, 3])
@@ -291,38 +307,47 @@ def test_doctest():
    assert [2, 3, 4] == child
    assert [0, 1, 2, 3, 4] == parent


def test_parent_get_set_del():
    """make sure SmartList's getitem/setitem/delitem work"""
    _test_get_set_del_item(SmartList)


def test_parent_add():
    """make sure SmartList's add/radd/iadd work"""
    _test_add_radd_iadd(SmartList)


def test_parent_other_magics():
    """make sure SmartList's other magically implemented features work"""
    _test_other_magic_methods(SmartList)


def test_parent_methods():
    """make sure SmartList's non-magic methods work, like append()"""
    _test_list_methods(SmartList)


def test_child_get_set_del():
    """make sure ListProxy's getitem/setitem/delitem work"""
    _dispatch_test_for_children(_test_get_set_del_item)


def test_child_add():
    """make sure ListProxy's add/radd/iadd work"""
    _dispatch_test_for_children(_test_add_radd_iadd)


def test_child_other_magics():
    """make sure ListProxy's other magically implemented features work"""
    _dispatch_test_for_children(_test_other_magic_methods)


def test_child_methods():
    """make sure ListProxy's non-magic methods work, like append()"""
    _dispatch_test_for_children(_test_list_methods)


def test_influence():
    """make sure changes are propagated from parents to children"""
    parent = SmartList([0, 1, 2, 3, 4, 5])
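
test_doctest and test_influence pin down SmartList's defining trick: slicing returns a live view, not a copy. A minimal sketch of the behavior the asserts above check:

    from mwparserfromhell.smart_list import SmartList

    parent = SmartList([0, 1, 2, 3])
    child = parent[2:]              # a ListProxy view onto the parent
    assert child == [2, 3]

    child.append(4)                 # writing through the child...
    assert child == [2, 3, 4]
    assert parent == [0, 1, 2, 3, 4]  # ...updates the parent in place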


+95 -20  tests/test_string_mixin.py

@@ -29,6 +29,7 @@ import pytest


from mwparserfromhell.string_mixin import StringMixIn


class _FakeString(StringMixIn):
    def __init__(self, data):
        self._data = data
@@ -36,22 +37,63 @@ class _FakeString(StringMixIn):
    def __str__(self):
        return self._data

-@pytest.mark.parametrize('method', [
-    "capitalize", "casefold", "center", "count", "encode", "endswith",
-    "expandtabs", "find", "format", "format_map", "index", "isalnum",
-    "isalpha", "isdecimal", "isdigit", "isidentifier", "islower",
-    "isnumeric", "isprintable", "isspace", "istitle", "isupper",
-    "join", "ljust", "lower", "lstrip", "maketrans", "partition",
-    "replace", "rfind", "rindex", "rjust", "rpartition", "rsplit",
-    "rstrip", "split", "splitlines", "startswith", "strip", "swapcase",
-    "title", "translate", "upper", "zfill"
-])
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        "capitalize",
+        "casefold",
+        "center",
+        "count",
+        "encode",
+        "endswith",
+        "expandtabs",
+        "find",
+        "format",
+        "format_map",
+        "index",
+        "isalnum",
+        "isalpha",
+        "isdecimal",
+        "isdigit",
+        "isidentifier",
+        "islower",
+        "isnumeric",
+        "isprintable",
+        "isspace",
+        "istitle",
+        "isupper",
+        "join",
+        "ljust",
+        "lower",
+        "lstrip",
+        "maketrans",
+        "partition",
+        "replace",
+        "rfind",
+        "rindex",
+        "rjust",
+        "rpartition",
+        "rsplit",
+        "rstrip",
+        "split",
+        "splitlines",
+        "startswith",
+        "strip",
+        "swapcase",
+        "title",
+        "translate",
+        "upper",
+        "zfill",
+    ],
+)
def test_docs(method):
    """make sure the various methods of StringMixIn have docstrings"""
    expected = getattr("foo", method).__doc__
    actual = getattr(_FakeString("foo"), method).__doc__
    assert expected == actual


def test_types():
    """make sure StringMixIns convert to different types correctly"""
    fstr = _FakeString("fake string")
@@ -63,6 +105,7 @@ def test_types():
    assert isinstance(bytes(fstr), bytes)
    assert isinstance(repr(fstr), str)


def test_comparisons():
    """make sure comparison operators work"""
    str1 = _FakeString("this is a fake string")
@@ -99,6 +142,7 @@ def test_comparisons():
    assert str5 < str1
    assert str5 <= str1


def test_other_magics():
    """test other magically implemented features, like len() and iter()"""
    str1 = _FakeString("fake string")
@@ -154,6 +198,7 @@ def test_other_magics():
    assert "real" not in str1
    assert "s" not in str2


def test_other_methods():
    """test the remaining non-magic methods of StringMixIn"""
    str1 = _FakeString("fake string")
@@ -354,8 +399,21 @@ def test_other_methods():
    actual = ["this", "is", "a", "sentence", "with", "whitespace"]
    assert actual == str25.rsplit()
    assert actual == str25.rsplit(None)
-    actual = ["", "", "", "this", "is", "a", "", "", "sentence", "with",
-              "", "whitespace", ""]
+    actual = [
+        "",
+        "",
+        "",
+        "this",
+        "is",
+        "a",
+        "",
+        "",
+        "sentence",
+        "with",
+        "",
+        "whitespace",
+        "",
+    ]
    assert actual == str25.rsplit(" ")
    actual = [" this is a", "sentence", "with", "whitespace"]
    assert actual == str25.rsplit(None, 3)
@@ -371,8 +429,21 @@ def test_other_methods():
    actual = ["this", "is", "a", "sentence", "with", "whitespace"]
    assert actual == str25.split()
    assert actual == str25.split(None)
-    actual = ["", "", "", "this", "is", "a", "", "", "sentence", "with",
-              "", "whitespace", ""]
+    actual = [
+        "",
+        "",
+        "",
+        "this",
+        "is",
+        "a",
+        "",
+        "",
+        "sentence",
+        "with",
+        "",
+        "whitespace",
+        "",
+    ]
    assert actual == str25.split(" ")
    actual = ["this", "is", "a", "sentence with whitespace "]
    assert actual == str25.split(None, 3)
@@ -382,10 +453,15 @@ def test_other_methods():
    assert actual == str25.split(maxsplit=3)

    str26 = _FakeString("lines\nof\ntext\r\nare\r\npresented\nhere")
-    assert ["lines", "of", "text", "are", "presented", "here"] \
-        == str26.splitlines()
-    assert ["lines\n", "of\n", "text\r\n", "are\r\n", "presented\n", "here"] \
-        == str26.splitlines(True)
+    assert ["lines", "of", "text", "are", "presented", "here"] == str26.splitlines()
+    assert [
+        "lines\n",
+        "of\n",
+        "text\r\n",
+        "are\r\n",
+        "presented\n",
+        "here",
+    ] == str26.splitlines(True)

    assert str1.startswith("fake") is True
    assert str1.startswith("faker") is False
@@ -398,8 +474,7 @@ def test_other_methods():

    assert "Fake String" == str1.title()

-    table1 = StringMixIn.maketrans({97: "1", 101: "2", 105: "3",
-                                    111: "4", 117: "5"})
+    table1 = StringMixIn.maketrans({97: "1", 101: "2", 105: "3", 111: "4", 117: "5"})
    table2 = StringMixIn.maketrans("aeiou", "12345")
    table3 = StringMixIn.maketrans("aeiou", "12345", "rts")
    assert "f1k2 str3ng" == str1.translate(table1)


+99 -40  tests/test_tag.py

@@ -34,21 +34,20 @@ agennq = lambda name, value: Attribute(wraptext(name), wraptext(value), None)
agenp = lambda name, v, a, b, c: Attribute(wraptext(name), v, '"', a, b, c)
agenpnv = lambda name, a, b, c: Attribute(wraptext(name), None, '"', a, b, c)


def test_str():
    """test Tag.__str__()"""
    node1 = Tag(wraptext("ref"))
-    node2 = Tag(wraptext("span"), wraptext("foo"),
-                [agen("style", "color: red;")])
-    node3 = Tag(wraptext("ref"),
-                attrs=[agennq("name", "foo"),
-                       agenpnv("some_attr", " ", "", "")],
-                self_closing=True)
+    node2 = Tag(wraptext("span"), wraptext("foo"), [agen("style", "color: red;")])
+    node3 = Tag(
+        wraptext("ref"),
+        attrs=[agennq("name", "foo"), agenpnv("some_attr", " ", "", "")],
+        self_closing=True,
+    )
    node4 = Tag(wraptext("br"), self_closing=True, padding=" ")
    node5 = Tag(wraptext("br"), self_closing=True, implicit=True)
-    node6 = Tag(wraptext("br"), self_closing=True, invalid=True,
-                implicit=True)
-    node7 = Tag(wraptext("br"), self_closing=True, invalid=True,
-                padding=" ")
+    node6 = Tag(wraptext("br"), self_closing=True, invalid=True, implicit=True)
+    node7 = Tag(wraptext("br"), self_closing=True, invalid=True, padding=" ")
    node8 = Tag(wraptext("hr"), wiki_markup="----", self_closing=True)
    node9 = Tag(wraptext("i"), wraptext("italics!"), wiki_markup="''")

@@ -62,6 +61,7 @@ def test_str():
    assert "----" == str(node8)
    assert "''italics!''" == str(node9)


def test_children():
    """test Tag.__children__()"""
    # <ref>foobar</ref>
@@ -69,10 +69,12 @@ def test_children():
    # '''bold text'''
    node2 = Tag(wraptext("b"), wraptext("bold text"), wiki_markup="'''")
    # <img id="foo" class="bar" selected />
-    node3 = Tag(wraptext("img"),
-                attrs=[agen("id", "foo"), agen("class", "bar"),
-                       agennv("selected")],
-                self_closing=True, padding=" ")
+    node3 = Tag(
+        wraptext("img"),
+        attrs=[agen("id", "foo"), agen("class", "bar"), agennv("selected")],
+        self_closing=True,
+        padding=" ",
+    )

    gen1 = node1.__children__()
    gen2 = node2.__children__()
@@ -94,6 +96,7 @@ def test_children():
    with pytest.raises(StopIteration):
        next(gen3)


def test_strip():
    """test Tag.__strip__()"""
    node1 = Tag(wraptext("i"), wraptext("foobar"))
@@ -104,28 +107,46 @@ def test_strip():
    assert node2.__strip__() is None
    assert node3.__strip__() is None


def test_showtree():
    """test Tag.__showtree__()"""
    output = []
    getter, marker = object(), object()
    get = lambda code: output.append((getter, code))
    mark = lambda: output.append(marker)
-    node1 = Tag(wraptext("ref"), wraptext("text"),
-                [agen("name", "foo"), agennv("selected")])
+    node1 = Tag(
+        wraptext("ref"), wraptext("text"), [agen("name", "foo"), agennv("selected")]
+    )
    node2 = Tag(wraptext("br"), self_closing=True, padding=" ")
-    node3 = Tag(wraptext("br"), self_closing=True, invalid=True,
-                implicit=True, padding=" ")
+    node3 = Tag(
+        wraptext("br"), self_closing=True, invalid=True, implicit=True, padding=" "
+    )
    node1.__showtree__(output.append, get, mark)
    node2.__showtree__(output.append, get, mark)
    node3.__showtree__(output.append, get, mark)
    valid = [
-        "<", (getter, node1.tag), (getter, node1.attributes[0].name),
-        " = ", marker, (getter, node1.attributes[0].value),
-        (getter, node1.attributes[1].name), ">", (getter, node1.contents),
-        "</", (getter, node1.closing_tag), ">", "<", (getter, node2.tag),
-        "/>", "</", (getter, node3.tag), ">"]
+        "<",
+        (getter, node1.tag),
+        (getter, node1.attributes[0].name),
+        " = ",
+        marker,
+        (getter, node1.attributes[0].value),
+        (getter, node1.attributes[1].name),
+        ">",
+        (getter, node1.contents),
+        "</",
+        (getter, node1.closing_tag),
+        ">",
+        "<",
+        (getter, node2.tag),
+        "/>",
+        "</",
+        (getter, node3.tag),
+        ">",
+    ]
    assert valid == output



def test_tag():
    """test getter/setter for the tag attribute"""
    tag = wraptext("ref")
@@ -137,6 +158,7 @@ def test_tag():
    assert_wikicode_equal(wraptext("span"), node.closing_tag)
    assert "<span>text</span>" == node


def test_contents():
    """test getter/setter for the contents attribute"""
    contents = wraptext("text")
@@ -147,6 +169,7 @@ def test_contents():
    assert_wikicode_equal(parsed, node.contents)
    assert "<ref>text and a {{template}}</ref>" == node


def test_attributes():
    """test getter for the attributes attribute"""
    attrs = [agen("name", "bar")]
@@ -155,6 +178,7 @@ def test_attributes():
    assert [] == node1.attributes
    assert attrs is node2.attributes


def test_wiki_markup():
    """test getter/setter for the wiki_markup attribute"""
    node = Tag(wraptext("i"), wraptext("italic text"))
@@ -166,6 +190,7 @@ def test_wiki_markup():
    assert node.wiki_markup is None
    assert "<i>italic text</i>" == node


def test_self_closing():
    """test getter/setter for the self_closing attribute"""
    node = Tag(wraptext("ref"), wraptext("foobar"))
@@ -177,6 +202,7 @@ def test_self_closing():
    assert node.self_closing is False
    assert "<ref>foobar</ref>" == node


def test_invalid():
    """test getter/setter for the invalid attribute"""
    node = Tag(wraptext("br"), self_closing=True, implicit=True)
@@ -188,6 +214,7 @@ def test_invalid():
    assert node.invalid is False
    assert "<br>" == node


def test_implicit():
    """test getter/setter for the implicit attribute"""
    node = Tag(wraptext("br"), self_closing=True)
@@ -199,6 +226,7 @@ def test_implicit():
    assert node.implicit is False
    assert "<br/>" == node


def test_padding():
    """test getter/setter for the padding attribute"""
    node = Tag(wraptext("ref"), wraptext("foobar"))
@@ -212,6 +240,7 @@ def test_padding():
    with pytest.raises(ValueError):
        node.__setattr__("padding", True)


def test_closing_tag():
    """test getter/setter for the closing_tag attribute"""
    tag = wraptext("ref")
@@ -222,6 +251,7 @@ def test_closing_tag():
    assert_wikicode_equal(parsed, node.closing_tag)
    assert "<ref>foobar</ref {{ignore me}}>" == node


def test_wiki_style_separator():
    """test getter/setter for wiki_style_separator attribute"""
    node = Tag(wraptext("table"), wraptext("\n"))
@@ -233,6 +263,7 @@ def test_wiki_style_separator():
    node2 = Tag(wraptext("table"), wraptext("\n"), wiki_style_separator="|")
    assert "|" == node2.wiki_style_separator


def test_closing_wiki_markup():
    """test getter/setter for closing_wiki_markup attribute"""
    node = Tag(wraptext("table"), wraptext("\n"))
@@ -248,12 +279,17 @@ def test_closing_wiki_markup():
    node.wiki_markup = False
    assert node.closing_wiki_markup is None
    assert "<table>\n</table>" == node
-    node2 = Tag(wraptext("table"), wraptext("\n"),
-                attrs=[agen("id", "foo")], wiki_markup="{|",
-                closing_wiki_markup="|}")
+    node2 = Tag(
+        wraptext("table"),
+        wraptext("\n"),
+        attrs=[agen("id", "foo")],
+        wiki_markup="{|",
+        closing_wiki_markup="|}",
+    )
    assert "|}" == node2.closing_wiki_markup
    assert '{| id="foo"\n|}' == node2


def test_has():
    """test Tag.has()"""
    node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")])
@@ -263,19 +299,26 @@ def test_has():
    assert node.has("Name") is False
    assert node.has("foo") is False

-    attrs = [agen("id", "foo"), agenp("class", "bar", " ", "\n", "\n"),
-             agen("foo", "bar"), agenpnv("foo", " ", " \n ", " \t")]
+    attrs = [
+        agen("id", "foo"),
+        agenp("class", "bar", " ", "\n", "\n"),
+        agen("foo", "bar"),
+        agenpnv("foo", " ", " \n ", " \t"),
+    ]
    node2 = Tag(wraptext("div"), attrs=attrs, self_closing=True)
    assert node2.has("id") is True
    assert node2.has("class") is True
-    assert node2.has(attrs[1].pad_first + str(attrs[1].name) +
-                     attrs[1].pad_before_eq) is True
+    assert (
+        node2.has(attrs[1].pad_first + str(attrs[1].name) + attrs[1].pad_before_eq)
+        is True
+    )
    assert node2.has(attrs[3]) is True
    assert node2.has(str(attrs[3])) is True
    assert node2.has("idclass") is False
    assert node2.has("id class") is False
    assert node2.has("id=foo") is False



def test_get():
    """test Tag.get()"""
    attrs = [agen("name", "foo")]
@@ -288,13 +331,18 @@ def test_get():
    with pytest.raises(ValueError):
        node.get("foo")

-    attrs = [agen("id", "foo"), agenp("class", "bar", " ", "\n", "\n"),
-             agen("foo", "bar"), agenpnv("foo", " ", " \n ", " \t")]
+    attrs = [
+        agen("id", "foo"),
+        agenp("class", "bar", " ", "\n", "\n"),
+        agen("foo", "bar"),
+        agenpnv("foo", " ", " \n ", " \t"),
+    ]
    node2 = Tag(wraptext("div"), attrs=attrs, self_closing=True)
    assert attrs[0] is node2.get("id")
    assert attrs[1] is node2.get("class")
    assert attrs[1] is node2.get(
-        attrs[1].pad_first + str(attrs[1].name) + attrs[1].pad_before_eq)
+        attrs[1].pad_first + str(attrs[1].name) + attrs[1].pad_before_eq
+    )
    assert attrs[3] is node2.get(attrs[3])
    assert attrs[3] is node2.get(str(attrs[3]))
    assert attrs[3] is node2.get(" foo")
@@ -305,6 +353,7 @@ def test_get():
    with pytest.raises(ValueError):
        node2.get("id=foo")


def test_add():
    """test Tag.add()"""
    node = Tag(wraptext("ref"), wraptext("cite"))
@@ -330,19 +379,29 @@ def test_add():
    assert attr6 == node.attributes[5]
    assert attr7 == node.attributes[6]
    assert attr7 == node.get("name")
-    assert_wikicode_equal(wrap([Template(wraptext("foobar"))]),
-                          node.attributes[5].value)
-    assert "".join(("<ref", attr1, attr2, attr3, attr4, attr5,
-                    attr6, attr7, ">cite</ref>")) == node
+    assert_wikicode_equal(
+        wrap([Template(wraptext("foobar"))]), node.attributes[5].value
+    )
+    assert (
+        "".join(
+            ("<ref", attr1, attr2, attr3, attr4, attr5, attr6, attr7, ">cite</ref>")
+        )
+        == node
+    )
    with pytest.raises(ValueError):
        node.add("name", "foo", quotes="bar")
    with pytest.raises(ValueError):
        node.add("name", "a bc d", quotes=None)


def test_remove():
    """test Tag.remove()"""
-    attrs = [agen("id", "foo"), agenp("class", "bar", " ", "\n", "\n"),
-             agen("foo", "bar"), agenpnv("foo", " ", " \n ", " \t")]
+    attrs = [
+        agen("id", "foo"),
+        agenp("class", "bar", " ", "\n", "\n"),
+        agen("foo", "bar"),
+        agenpnv("foo", " ", " \n ", " \t"),
+    ]
    node = Tag(wraptext("div"), attrs=attrs, self_closing=True)
    node.remove("class")
    assert '<div id="foo" foo="bar" foo \n />' == node
@@ -351,4 +410,4 @@ def test_remove():
    with pytest.raises(ValueError):
        node.remove("foo")
    node.remove("id")
-    assert '<div/>' == node
+    assert "<div/>" == node

+267 -140  tests/test_template.py

@@ -34,19 +34,19 @@ from .conftest import assert_wikicode_equal, wrap, wraptext
pgens = lambda k, v: Parameter(wraptext(k), wraptext(v), showkey=True)
pgenh = lambda k, v: Parameter(wraptext(k), wraptext(v), showkey=False)


def test_str():
    """test Template.__str__()"""
    node = Template(wraptext("foobar"))
    assert "{{foobar}}" == str(node)
-    node2 = Template(wraptext("foo"),
-                     [pgenh("1", "bar"), pgens("abc", "def")])
+    node2 = Template(wraptext("foo"), [pgenh("1", "bar"), pgens("abc", "def")])
    assert "{{foo|bar|abc=def}}" == str(node2)


def test_children():
    """test Template.__children__()"""
    node2p1 = Parameter(wraptext("1"), wraptext("bar"), showkey=False)
-    node2p2 = Parameter(wraptext("abc"), wrap([Text("def"), Text("ghi")]),
-                        showkey=True)
+    node2p2 = Parameter(wraptext("abc"), wrap([Text("def"), Text("ghi")]), showkey=True)
    node1 = Template(wraptext("foobar"))
    node2 = Template(wraptext("foo"), [node2p1, node2p2])

@@ -62,16 +62,23 @@ def test_children():
    with pytest.raises(StopIteration):
        next(gen2)


def test_strip():
    """test Template.__strip__()"""
    node1 = Template(wraptext("foobar"))
-    node2 = Template(wraptext("foo"), [
-        pgenh("1", "bar"), pgens("foo", ""), pgens("abc", "def")])
-    node3 = Template(wraptext("foo"), [
-        pgenh("1", "foo"),
-        Parameter(wraptext("2"), wrap([Template(wraptext("hello"))]),
-                  showkey=False),
-        pgenh("3", "bar")])
+    node2 = Template(
+        wraptext("foo"), [pgenh("1", "bar"), pgens("foo", ""), pgens("abc", "def")]
+    )
+    node3 = Template(
+        wraptext("foo"),
+        [
+            pgenh("1", "foo"),
+            Parameter(
+                wraptext("2"), wrap([Template(wraptext("hello"))]), showkey=False
+            ),
+            pgenh("3", "bar"),
+        ],
+    )

    assert node1.__strip__(keep_template_params=False) is None
    assert node2.__strip__(keep_template_params=False) is None
@@ -79,6 +86,7 @@ def test_strip():
    assert "bar def" == node2.__strip__(keep_template_params=True)
    assert "foo bar" == node3.__strip__(keep_template_params=True)



def test_showtree():
    """test Template.__showtree__()"""
    output = []
@@ -86,18 +94,32 @@ def test_showtree():
    get = lambda code: output.append((getter, code))
    mark = lambda: output.append(marker)
    node1 = Template(wraptext("foobar"))
-    node2 = Template(wraptext("foo"),
-                     [pgenh("1", "bar"), pgens("abc", "def")])
+    node2 = Template(wraptext("foo"), [pgenh("1", "bar"), pgens("abc", "def")])
    node1.__showtree__(output.append, get, mark)
    node2.__showtree__(output.append, get, mark)
    valid = [
-        "{{", (getter, node1.name), "}}", "{{", (getter, node2.name),
-        " | ", marker, (getter, node2.params[0].name), " = ", marker,
-        (getter, node2.params[0].value), " | ", marker,
-        (getter, node2.params[1].name), " = ", marker,
-        (getter, node2.params[1].value), "}}"]
+        "{{",
+        (getter, node1.name),
+        "}}",
+        "{{",
+        (getter, node2.name),
+        " | ",
+        marker,
+        (getter, node2.params[0].name),
+        " = ",
+        marker,
+        (getter, node2.params[0].value),
+        " | ",
+        marker,
+        (getter, node2.params[1].name),
+        " = ",
+        marker,
+        (getter, node2.params[1].value),
+        "}}",
+    ]
    assert valid == output


def test_name():
    """test getter/setter for the name attribute"""
    name = wraptext("foobar")
@@ -110,6 +132,7 @@ def test_name():
    assert_wikicode_equal(wraptext("asdf"), node1.name)
    assert_wikicode_equal(wraptext("téstïng"), node2.name)


def test_params():
    """test getter for the params attribute"""
    node1 = Template(wraptext("foobar"))
@@ -118,13 +141,14 @@ def test_params():
    assert [] == node1.params
    assert plist is node2.params


def test_has():
    """test Template.has()"""
    node1 = Template(wraptext("foobar"))
-    node2 = Template(wraptext("foo"),
-                     [pgenh("1", "bar"), pgens("\nabc ", "def")])
-    node3 = Template(wraptext("foo"),
-                     [pgenh("1", "a"), pgens("b", "c"), pgens("1", "d")])
+    node2 = Template(wraptext("foo"), [pgenh("1", "bar"), pgens("\nabc ", "def")])
+    node3 = Template(
+        wraptext("foo"), [pgenh("1", "a"), pgens("b", "c"), pgens("1", "d")]
+    )
    node4 = Template(wraptext("foo"), [pgenh("1", "a"), pgens("b", " ")])
    assert node1.has("foobar", False) is False
    assert node2.has(1, False) is True
@@ -138,6 +162,7 @@ def test_has():
    assert node1.has_param("foobar", False) is False
    assert node2.has_param(1, False) is True


def test_get():
    """test Template.get()"""
    node1 = Template(wraptext("foobar"))
@@ -159,16 +184,15 @@ def test_get():
    assert node3p2 is node3.get("1")
    assert node4p1 is node4.get("b ")



def test_add():
    """test Template.add()"""
    node1 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "d")])
    node2 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "d")])
    node3 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "d")])
    node4 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "d")])
-    node5 = Template(wraptext("a"), [pgens("b", "c"),
-                                     pgens(" d ", "e")])
-    node6 = Template(wraptext("a"), [pgens("b", "c"), pgens("b", "d"),
-                                     pgens("b", "e")])
+    node5 = Template(wraptext("a"), [pgens("b", "c"), pgens(" d ", "e")])
+    node6 = Template(wraptext("a"), [pgens("b", "c"), pgens("b", "d"), pgens("b", "e")])
    node7 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "d")])
    node8p = pgenh("1", "d")
    node8 = Template(wraptext("a"), [pgens("b", "c"), node8p])
@@ -176,48 +200,87 @@ def test_add():
    node10 = Template(wraptext("a"), [pgens("b", "c"), pgenh("1", "e")])
    node11 = Template(wraptext("a"), [pgens("b", "c")])
    node12 = Template(wraptext("a"), [pgens("b", "c")])
-    node13 = Template(wraptext("a"), [
-        pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")])
-    node14 = Template(wraptext("a\n"), [
-        pgens("b ", "c\n"), pgens("d ", " e"), pgens("f ", "g\n"),
-        pgens("h ", " i\n")])
-    node15 = Template(wraptext("a"), [
-        pgens("b ", " c\n"), pgens("\nd ", " e"), pgens("\nf ", "g ")])
-    node16 = Template(wraptext("a"), [
-        pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")])
+    node13 = Template(
+        wraptext("a"), [pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")]
+    )
+    node14 = Template(
+        wraptext("a\n"),
+        [
+            pgens("b ", "c\n"),
+            pgens("d ", " e"),
+            pgens("f ", "g\n"),
+            pgens("h ", " i\n"),
+        ],
+    )
+    node15 = Template(
+        wraptext("a"),
+        [pgens("b ", " c\n"), pgens("\nd ", " e"), pgens("\nf ", "g ")],
+    )
+    node16 = Template(
+        wraptext("a"), [pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")]
+    )
    node17 = Template(wraptext("a"), [pgenh("1", "b")])
    node18 = Template(wraptext("a"), [pgenh("1", "b")])
    node19 = Template(wraptext("a"), [pgenh("1", "b")])
-    node20 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"),
-                                      pgenh("3", "d"), pgenh("4", "e")])
-    node21 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"),
-                                      pgens("4", "d"), pgens("5", "e")])
-    node22 = Template(wraptext("a"), [pgenh("1", "b"), pgenh("2", "c"),
-                                      pgens("4", "d"), pgens("5", "e")])
+    node20 = Template(
+        wraptext("a"),
+        [pgenh("1", "b"), pgenh("2", "c"), pgenh("3", "d"), pgenh("4", "e")],
+    )
+    node21 = Template(
+        wraptext("a"),
+        [pgenh("1", "b"), pgenh("2", "c"), pgens("4", "d"), pgens("5", "e")],
+    )
+    node22 = Template(
+        wraptext("a"),
+        [pgenh("1", "b"), pgenh("2", "c"), pgens("4", "d"), pgens("5", "e")],
+    )
    node23 = Template(wraptext("a"), [pgenh("1", "b")])
    node24 = Template(wraptext("a"), [pgenh("1", "b")])
    node25 = Template(wraptext("a"), [pgens("b", "c")])
    node26 = Template(wraptext("a"), [pgenh("1", "b")])
    node27 = Template(wraptext("a"), [pgenh("1", "b")])
    node28 = Template(wraptext("a"), [pgens("1", "b")])
-    node29 = Template(wraptext("a"), [
-        pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")])
-    node30 = Template(wraptext("a\n"), [
-        pgens("b ", "c\n"), pgens("d ", " e"), pgens("f ", "g\n"),
-        pgens("h ", " i\n")])
-    node31 = Template(wraptext("a"), [
-        pgens("b ", " c\n"), pgens("\nd ", " e"), pgens("\nf ", "g ")])
-    node32 = Template(wraptext("a"), [
-        pgens("\nb ", " c "), pgens("\nd ", " e "), pgens("\nf ", " g ")])
-    node33 = Template(wraptext("a"), [pgens("b", "c"), pgens("d", "e"),
-                                      pgens("b", "f"), pgens("b", "h"),
-                                      pgens("i", "j")])
-    node34 = Template(wraptext("a"), [pgens("1", "b"), pgens("x", "y"),
-                                      pgens("1", "c"), pgens("2", "d")])
-    node35 = Template(wraptext("a"), [pgens("1", "b"), pgens("x", "y"),
-                                      pgenh("1", "c"), pgenh("2", "d")])
-    node36 = Template(wraptext("a"), [pgens("b", "c"), pgens("d", "e"),
-                                      pgens("f", "g")])
+    node29 = Template(
+        wraptext("a"), [pgens("\nb ", " c"), pgens("\nd ", " e"), pgens("\nf ", " g")]
+    )
+    node30 = Template(
+        wraptext("a\n"),
+        [
+            pgens("b ", "c\n"),
+            pgens("d ", " e"),
+            pgens("f ", "g\n"),
+            pgens("h ", " i\n"),
+        ],
+    )
+    node31 = Template(
+        wraptext("a"),
+        [pgens("b ", " c\n"), pgens("\nd ", " e"), pgens("\nf ", "g ")],
+    )
+    node32 = Template(
+        wraptext("a"),
+        [pgens("\nb ", " c "), pgens("\nd ", " e "), pgens("\nf ", " g ")],
+    )
+    node33 = Template(
+        wraptext("a"),
+        [
+            pgens("b", "c"),
+            pgens("d", "e"),
+            pgens("b", "f"),
+            pgens("b", "h"),
+            pgens("i", "j"),
+        ],
+    )
+    node34 = Template(
+        wraptext("a"),
+        [pgens("1", "b"), pgens("x", "y"), pgens("1", "c"), pgens("2", "d")],
+    )
+    node35 = Template(
+        wraptext("a"),
+        [pgens("1", "b"), pgens("x", "y"), pgenh("1", "c"), pgenh("2", "d")],
+    )
+    node36 = Template(
+        wraptext("a"), [pgens("b", "c"), pgens("d", "e"), pgens("f", "g")]
+    )
    node37 = Template(wraptext("a"), [pgenh("1", "")])
    node38 = Template(wraptext("abc"))
    node39 = Template(wraptext("a"), [pgenh("1", " b ")])
@@ -320,65 +383,121 @@ def test_add():
    assert "{{a|1= b|2= c|3= d}}" == node41
    assert "{{a|b=hello \n}}" == node42



def test_remove():
    """test Template.remove()"""
    node1 = Template(wraptext("foobar"))
-    node2 = Template(wraptext("foo"),
-                     [pgenh("1", "bar"), pgens("abc", "def")])
-    node3 = Template(wraptext("foo"),
-                     [pgenh("1", "bar"), pgens("abc", "def")])
-    node4 = Template(wraptext("foo"),
-                     [pgenh("1", "bar"), pgenh("2", "baz")])
-    node5 = Template(wraptext("foo"), [
-        pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
-    node6 = Template(wraptext("foo"), [
-        pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
-    node7 = Template(wraptext("foo"), [
-        pgens("1 ", "a"), pgens(" 1", "b"), pgens("2", "c")])
-    node8 = Template(wraptext("foo"), [
-        pgens("1 ", "a"), pgens(" 1", "b"), pgens("2", "c")])
-    node9 = Template(wraptext("foo"), [
-        pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
-    node10 = Template(wraptext("foo"), [
-        pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
-    node11 = Template(wraptext("foo"), [
-        pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
-    node12 = Template(wraptext("foo"), [
-        pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
-    node13 = Template(wraptext("foo"), [
-        pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
-    node14 = Template(wraptext("foo"), [
-        pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
-    node15 = Template(wraptext("foo"), [
-        pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
-    node16 = Template(wraptext("foo"), [
-        pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")])
-    node17 = Template(wraptext("foo"), [
-        pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
-    node18 = Template(wraptext("foo"), [
-        pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
-    node19 = Template(wraptext("foo"), [
-        pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
-    node20 = Template(wraptext("foo"), [
-        pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")])
-    node21 = Template(wraptext("foo"), [
-        pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
-        pgens("a", "b")])
-    node22 = Template(wraptext("foo"), [
-        pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
-        pgens("a", "b")])
-    node23 = Template(wraptext("foo"), [
-        pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
-        pgens("a", "b")])
-    node24 = Template(wraptext("foo"), [
-        pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
-        pgens("a", "b")])
-    node25 = Template(wraptext("foo"), [
-        pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
-        pgens("a", "b")])
-    node26 = Template(wraptext("foo"), [
-        pgens("a", "b"), pgens("c", "d"), pgens("e", "f"), pgens("a", "b"),
-        pgens("a", "b")])
+    node2 = Template(wraptext("foo"), [pgenh("1", "bar"), pgens("abc", "def")])
+    node3 = Template(wraptext("foo"), [pgenh("1", "bar"), pgens("abc", "def")])
+    node4 = Template(wraptext("foo"), [pgenh("1", "bar"), pgenh("2", "baz")])
+    node5 = Template(
+        wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
+    )
+    node6 = Template(
+        wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
+    )
+    node7 = Template(
+        wraptext("foo"), [pgens("1 ", "a"), pgens(" 1", "b"), pgens("2", "c")]
+    )
+    node8 = Template(
+        wraptext("foo"), [pgens("1 ", "a"), pgens(" 1", "b"), pgens("2", "c")]
+    )
+    node9 = Template(
+        wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
+    )
+    node10 = Template(
+        wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
+    )
+    node11 = Template(
+        wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
+    )
+    node12 = Template(
+        wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
+    )
+    node13 = Template(
+        wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
+    )
+    node14 = Template(
+        wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
+    )
+    node15 = Template(
+        wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
+    )
+    node16 = Template(
+        wraptext("foo"), [pgens(" a", "b"), pgens("b", "c"), pgens("a ", "d")]
+    )
+    node17 = Template(
+        wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
+    )
+    node18 = Template(
+        wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
+    )
+    node19 = Template(
+        wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
+    )
+    node20 = Template(
+        wraptext("foo"), [pgens("1 ", "a"), pgenh("1", "b"), pgenh("2", "c")]
+    )
+    node21 = Template(
+        wraptext("foo"),
+        [
+            pgens("a", "b"),
+            pgens("c", "d"),
+            pgens("e", "f"),
+            pgens("a", "b"),
+            pgens("a", "b"),
+        ],
+    )
+    node22 = Template(
+        wraptext("foo"),
+        [
+            pgens("a", "b"),
+            pgens("c", "d"),
+            pgens("e", "f"),
+            pgens("a", "b"),
+            pgens("a", "b"),
+        ],
+    )
+    node23 = Template(
+        wraptext("foo"),
+        [
+            pgens("a", "b"),
+            pgens("c", "d"),
+            pgens("e", "f"),
+            pgens("a", "b"),
+            pgens("a", "b"),
+        ],
+    )
+    node24 = Template(
+        wraptext("foo"),
+        [
+            pgens("a", "b"),
+            pgens("c", "d"),
+            pgens("e", "f"),
+            pgens("a", "b"),
+            pgens("a", "b"),
+        ],
+    )
+    node25 = Template(
+        wraptext("foo"),
+        [
+            pgens("a", "b"),
+            pgens("c", "d"),
+            pgens("e", "f"),
+            pgens("a", "b"),
+            pgens("a", "b"),
+        ],
+    )
+    node26 = Template(
+        wraptext("foo"),
+        [
+            pgens("a", "b"),
+            pgens("c", "d"),
+            pgens("e", "f"),
+            pgens("a", "b"),
+            pgens("a", "b"),
+        ],
+    )
    node27 = Template(wraptext("foo"), [pgenh("1", "bar")])
    node28 = Template(wraptext("foo"), [pgenh("1", "bar")])

@@ -444,12 +563,14 @@ def test_remove():
    with pytest.raises(ValueError):
        node27.remove(node28.get(1))



def test_formatting(): def test_formatting():
"""test realistic param manipulation with complex whitespace formatting """test realistic param manipulation with complex whitespace formatting
(assumes that parsing works correctly)""" (assumes that parsing works correctly)"""
tests = [ tests = [
# https://en.wikipedia.org/w/index.php?title=Lamar_County,_Georgia&oldid=792356004
("""{{Infobox U.S. county
# https://en.wikipedia.org/w/index.php?title=Lamar_County,_Georgia&oldid=792356004
(
"""{{Infobox U.S. county
| county = Lamar County | county = Lamar County
| state = Georgia | state = Georgia
| seal = | seal =
@@ -471,16 +592,17 @@ def test_formatting():
| district = 3rd | district = 3rd
| named for = [[Lucius Quintus Cincinnatus Lamar II]] | named for = [[Lucius Quintus Cincinnatus Lamar II]]
}}""", }}""",
"""@@ -11,4 +11,4 @@
"""@@ -11,4 +11,4 @@
| area percentage = 1.3% | area percentage = 1.3%
-| census yr = 2010 -| census yr = 2010
-| pop = 18317 -| pop = 18317
+| census estimate yr = 2016 +| census estimate yr = 2016
 +| pop = 12345<ref>example ref</ref>
-| density_sq_mi = 100"""),
-
-# https://en.wikipedia.org/w/index.php?title=Rockdale_County,_Georgia&oldid=792359760
-("""{{Infobox U.S. County|
+| density_sq_mi = 100""",
+),
+# https://en.wikipedia.org/w/index.php?title=Rockdale_County,_Georgia&oldid=792359760
+(
+    """{{Infobox U.S. County|
 county = Rockdale County |
 state = Georgia |
 seal = |
@@ -500,16 +622,17 @@ def test_formatting():
 | district = 4th
 | time zone= Eastern
 }}""",
-"""@@ -11,4 +11,4 @@
+"""@@ -11,4 +11,4 @@
 area percentage = 1.7% |
 - census yr = 2010|
 - pop = 85215 |
 + census estimate yr = 2016 |
 + pop = 12345<ref>example ref</ref> |
-density_sq_mi = 657 |"""),
-
-# https://en.wikipedia.org/w/index.php?title=Spalding_County,_Georgia&oldid=792360413
-("""{{Infobox U.S. County|
+density_sq_mi = 657 |""",
+),
+# https://en.wikipedia.org/w/index.php?title=Spalding_County,_Georgia&oldid=792360413
+(
+    """{{Infobox U.S. County|
 | county = Spalding County |
 | state = Georgia |
 | seal = |
@@ -530,16 +653,17 @@ def test_formatting():
 | district = 3rd
 | time zone = Eastern
 }}""",
-"""@@ -11,4 +11,4 @@
+"""@@ -11,4 +11,4 @@
 | area percentage = 1.6% |
 -| census yr = 2010|
 -| pop = 64073 |
 +|
 +| census estimate yr = 2016 | pop = 12345<ref>example ref</ref> |
-| density_sq_mi = 326 |"""),
-
-# https://en.wikipedia.org/w/index.php?title=Clinton_County,_Illinois&oldid=794694648
-("""{{Infobox U.S. county
+| density_sq_mi = 326 |""",
+),
+# https://en.wikipedia.org/w/index.php?title=Clinton_County,_Illinois&oldid=794694648
+(
+    """{{Infobox U.S. county
 |county = Clinton County
 |state = Illinois
 | ex image = File:Clinton County Courthouse, Carlyle.jpg
@@ -560,16 +684,17 @@ def test_formatting():
 |web = www.clintonco.illinois.gov
 | district = 15th
 }}""",
-"""@@ -15,4 +15,4 @@
+"""@@ -15,4 +15,4 @@
 |area percentage = 5.8%
 - |census yr = 2010
 - |pop = 37762
 + |census estimate yr = 2016
 + |pop = 12345<ref>example ref</ref>
-|density_sq_mi = 80"""),
-
-# https://en.wikipedia.org/w/index.php?title=Winnebago_County,_Illinois&oldid=789193800
-("""{{Infobox U.S. county |
+|density_sq_mi = 80""",
+),
+# https://en.wikipedia.org/w/index.php?title=Winnebago_County,_Illinois&oldid=789193800
+(
+    """{{Infobox U.S. county |
 county = Winnebago County |
 state = Illinois |
 seal = Winnebago County il seal.png |
@@ -590,19 +715,21 @@ def test_formatting():
 | district = 16th
 | district2 = 17th
 }}""",
-"""@@ -11,4 +11,4 @@
+"""@@ -11,4 +11,4 @@
 area percentage = 1.1% |
 - census yr = 2010|
 - pop = 295266 |
 + census estimate yr = 2016|
 + pop = 12345<ref>example ref</ref> |
-density_sq_mi = 575""")]
+density_sq_mi = 575""",
+),
+]

 for (original, expected) in tests:
     code = parse(original)
     template = code.filter_templates()[0]
     template.add("pop", "12345<ref>example ref</ref>")
-    template.add('census estimate yr', "2016", before="pop")
+    template.add("census estimate yr", "2016", before="pop")
     template.remove("census yr")

     oldlines = original.splitlines(True)
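An aside for readers of this hunk: the loop at the bottom drives mwparserfromhell's template-editing API, whose quote style black just normalized. A minimal standalone sketch of those same calls (the wikitext input here is invented for illustration):

    import mwparserfromhell

    code = mwparserfromhell.parse("{{Infobox| census yr = 2010| pop = 85215 }}")
    template = code.filter_templates()[0]

    # Update the population in place, then slot the estimate-year parameter
    # in front of it so the parameter order stays readable.
    template.add("pop", "12345<ref>example ref</ref>")
    template.add("census estimate yr", "2016", before="pop")
    template.remove("census yr")

    print(code)  # surrounding spacing and pipes are preserved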


+5 -0   tests/test_text.py

@@ -26,6 +26,7 @@ import pytest

 from mwparserfromhell.nodes import Text

+
 def test_str():
     """test Text.__str__()"""
     node = Text("foobar")
@@ -33,6 +34,7 @@ def test_str():
     node2 = Text("fóóbar")
     assert "fóóbar" == str(node2)

+
 def test_children():
     """test Text.__children__()"""
     node = Text("foobar")
@@ -40,11 +42,13 @@ def test_children():
     with pytest.raises(StopIteration):
         next(gen)

+
 def test_strip():
     """test Text.__strip__()"""
     node = Text("foobar")
     assert node is node.__strip__()

+
 def test_showtree():
     """test Text.__showtree__()"""
     output = []
@@ -57,6 +61,7 @@ def test_showtree():
     res = ["foobar", r"f\xf3\xf3bar", "\\U00010332\\U0001033f\\U00010344"]
     assert res == output

+
 def test_value():
     """test getter/setter for the value attribute"""
     node = Text("foobar")
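For context, Text is the leaf node these tests cover; a minimal sketch of the behavior they assert:

    from mwparserfromhell.nodes import Text

    node = Text("fóóbar")
    assert str(node) == "fóóbar"     # __str__() returns the stored text
    assert node.__strip__() is node  # stripping a Text node returns itself
    node.value = "foobar"            # value is an ordinary settable attribute
    assert str(node) == "foobar"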


+26 -14   tests/test_tokenizer.py

@@ -33,29 +33,32 @@ try:
 except ImportError:
     CTokenizer = None

+
 class _TestParseError(Exception):
     """Raised internally when a test could not be parsed."""

+
 def _parse_test(test, data):
     """Parse an individual *test*, storing its info in *data*."""
     for line in test.strip().splitlines():
         if line.startswith("name:"):
-            data["name"] = line[len("name:"):].strip()
+            data["name"] = line[len("name:") :].strip()
         elif line.startswith("label:"):
-            data["label"] = line[len("label:"):].strip()
+            data["label"] = line[len("label:") :].strip()
         elif line.startswith("input:"):
-            raw = line[len("input:"):].strip()
+            raw = line[len("input:") :].strip()
             if raw[0] == '"' and raw[-1] == '"':
                 raw = raw[1:-1]
             raw = raw.encode("raw_unicode_escape")
             data["input"] = raw.decode("unicode_escape")
         elif line.startswith("output:"):
-            raw = line[len("output:"):].strip()
+            raw = line[len("output:") :].strip()
             try:
                 data["output"] = eval(raw, vars(tokens))
             except Exception as err:
                 raise _TestParseError(err) from err

+
 def _load_tests(filename, name, text):
     """Load all tests in *text* from the file *filename*."""
     tests = text.split("\n---\n")
@@ -77,15 +80,18 @@ def _load_tests(filename, name, text):
             warnings.warn(error.format(filename))
             continue
         if data["input"] is None or data["output"] is None:
-            error = "Test '{}' in '{}' was ignored because it lacked an input or an output"
+            error = (
+                "Test '{}' in '{}' was ignored because it lacked an input or an output"
+            )
             warnings.warn(error.format(data["name"], filename))
             continue

         # Include test filename in name
-        data['name'] = '{}:{}'.format(name, data['name'])
+        data["name"] = "{}:{}".format(name, data["name"])

         yield data

+
 def build():
     """Load and install all tests from the 'tokenizer' directory."""
     directory = path.join(path.dirname(__file__), "tokenizer")
@@ -96,31 +102,37 @@ def build():
         fullname = path.join(directory, filename)
         with codecs.open(fullname, "r", encoding="utf8") as fp:
             text = fp.read()
-        name = path.split(fullname)[1][:-len(extension)]
+        name = path.split(fullname)[1][: -len(extension)]
         yield from _load_tests(fullname, name, text)

-@pytest.mark.parametrize("tokenizer", filter(None, (
-    CTokenizer, PyTokenizer
-)), ids=lambda t: 'CTokenizer' if t.USES_C else 'PyTokenizer')
-@pytest.mark.parametrize("data", build(), ids=lambda data: data['name'])
+
+@pytest.mark.parametrize(
+    "tokenizer",
+    filter(None, (CTokenizer, PyTokenizer)),
+    ids=lambda t: "CTokenizer" if t.USES_C else "PyTokenizer",
+)
+@pytest.mark.parametrize("data", build(), ids=lambda data: data["name"])
 def test_tokenizer(tokenizer, data):
     expected = data["output"]
     actual = tokenizer().tokenize(data["input"])
     assert expected == actual

-@pytest.mark.parametrize("data", build(), ids=lambda data: data['name'])
+
+@pytest.mark.parametrize("data", build(), ids=lambda data: data["name"])
 def test_roundtrip(data):
     expected = data["input"]
     actual = str(Builder().build(data["output"][:]))
     assert expected == actual

-@pytest.mark.skipif(CTokenizer is None, reason='CTokenizer not available')
+
+@pytest.mark.skipif(CTokenizer is None, reason="CTokenizer not available")
 def test_c_tokenizer_uses_c():
     """make sure the C tokenizer identifies as using a C extension"""
     assert CTokenizer.USES_C is True
     assert CTokenizer().USES_C is True

+
 def test_describe_context():
     assert "" == contexts.describe(0)
-    ctx = contexts.describe(contexts.TEMPLATE_PARAM_KEY|contexts.HAS_TEXT)
+    ctx = contexts.describe(contexts.TEMPLATE_PARAM_KEY | contexts.HAS_TEXT)
     assert "TEMPLATE_PARAM_KEY|HAS_TEXT" == ctx

+9 -5   tests/test_tokens.py

@@ -26,6 +26,7 @@ import pytest

 from mwparserfromhell.parser import tokens

+
 @pytest.mark.parametrize("name", tokens.__all__)
 def test_issubclass(name):
     """check that all classes within the tokens module are really Tokens"""
@@ -34,6 +35,7 @@ def test_issubclass(name):
     assert isinstance(klass(), klass)
     assert isinstance(klass(), tokens.Token)

+
 def test_attributes():
     """check that Token attributes can be managed properly"""
     token1 = tokens.Token()
@@ -54,6 +56,7 @@ def test_attributes():
     with pytest.raises(KeyError):
         token2.__delattr__("baz")

+
 def test_repr():
     """check that repr() on a Token works as expected"""
     token1 = tokens.Token()
@@ -65,6 +68,7 @@ def test_repr():
     assert repr(token2) in ("Token(foo='bar', baz=123)", "Token(baz=123, foo='bar')")
     assert "Text(text='" + hundredchars + "')" == repr(token3)

+
 def test_equality():
     """check that equivalent tokens are considered equal"""
     token1 = tokens.Token()
@@ -83,11 +87,11 @@ def test_equality():
     assert token4 != token6
     assert token5 != token6

-@pytest.mark.parametrize("token", [
-    tokens.Token(),
-    tokens.Token(foo="bar", baz=123),
-    tokens.Text(text="earwig")
-])
+
+@pytest.mark.parametrize(
+    "token",
+    [tokens.Token(), tokens.Token(foo="bar", baz=123), tokens.Text(text="earwig")],
+)
 def test_repr_equality(token):
     """check that eval(repr(token)) == token"""
     assert token == eval(repr(token), vars(tokens))
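A minimal sketch of the Token contract these tests pin down: attributes are arbitrary keyword arguments, and repr() round-trips through eval() in the module's namespace:

    from mwparserfromhell.parser import tokens

    token = tokens.Token(foo="bar", baz=123)
    assert token.foo == "bar"                        # kwargs become attributes
    token.spam = "eggs"                              # attributes can be added later
    assert token == eval(repr(token), vars(tokens))  # repr() round-trips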

+23 -18   tests/test_utils.py

@@ -28,28 +28,33 @@ from mwparserfromhell.nodes import Template, Text
 from mwparserfromhell.utils import parse_anything
 from .conftest import assert_wikicode_equal, wrap, wraptext

-@pytest.mark.parametrize("test,valid", [
-    (wraptext("foobar"), wraptext("foobar")),
-    (Template(wraptext("spam")), wrap([Template(wraptext("spam"))])),
-    ("fóóbar", wraptext("fóóbar")),
-    (b"foob\xc3\xa1r", wraptext("foobár")),
-    (123, wraptext("123")),
-    (True, wraptext("True")),
-    (None, wrap([])),
-    ([Text("foo"), Text("bar"), Text("baz")],
-     wraptext("foo", "bar", "baz")),
-    ([wraptext("foo"), Text("bar"), "baz", 123, 456],
-     wraptext("foo", "bar", "baz", "123", "456")),
-    ([[[([[((("foo",),),)], "bar"],)]]], wraptext("foo", "bar"))
-])
+
+@pytest.mark.parametrize(
+    "test,valid",
+    [
+        (wraptext("foobar"), wraptext("foobar")),
+        (Template(wraptext("spam")), wrap([Template(wraptext("spam"))])),
+        ("fóóbar", wraptext("fóóbar")),
+        (b"foob\xc3\xa1r", wraptext("foobár")),
+        (123, wraptext("123")),
+        (True, wraptext("True")),
+        (None, wrap([])),
+        ([Text("foo"), Text("bar"), Text("baz")], wraptext("foo", "bar", "baz")),
+        (
+            [wraptext("foo"), Text("bar"), "baz", 123, 456],
+            wraptext("foo", "bar", "baz", "123", "456"),
+        ),
+        ([[[([[((("foo",),),)], "bar"],)]]], wraptext("foo", "bar")),
+    ],
+)
 def test_parse_anything_valid(test, valid):
     """tests for valid input to utils.parse_anything()"""
     assert_wikicode_equal(valid, parse_anything(test))

-@pytest.mark.parametrize("invalid", [
-    Ellipsis, object, object(), type,
-    ["foo", [object]]
-])
+
+@pytest.mark.parametrize(
+    "invalid", [Ellipsis, object, object(), type, ["foo", [object]]]
+)
 def test_parse_anything_invalid(invalid):
     """tests for invalid input to utils.parse_anything()"""
     with pytest.raises(ValueError):
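The parametrized cases above boil down to: parse_anything() coerces almost any input into a Wikicode, flattening nested iterables, and raises ValueError for anything else. A minimal sketch:

    from mwparserfromhell.utils import parse_anything

    print(parse_anything(123))                     # -> Wikicode "123"
    print(parse_anything(b"foob\xc3\xa1r"))        # bytes are decoded: "foobár"
    print(parse_anything(["{{foo}}", "bar", 42]))  # iterables are flattened

    try:
        parse_anything(object())                   # unsupported input
    except ValueError:
        pass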


+140 -67   tests/test_wikicode.py

@@ -34,6 +34,7 @@ from mwparserfromhell.wikicode import Wikicode
 from mwparserfromhell import parse
 from .conftest import wrap, wraptext

+
 def test_str():
     """test Wikicode.__str__()"""
     code1 = parse("foobar")
@@ -41,6 +42,7 @@ def test_str():
     assert "foobar" == str(code1)
     assert "Have a {{template}} and a [[page|link]]" == str(code2)

+
 def test_nodes():
     """test getter/setter for the nodes attribute"""
     code = parse("Have a {{template}}")
@@ -57,6 +59,7 @@ def test_nodes():
     with pytest.raises(ValueError):
         code.__setattr__("nodes", object)

+
 def test_get():
     """test Wikicode.get()"""
     code = parse("Have a {{template}} and a [[page|link]]")
@@ -65,6 +68,7 @@ def test_get():
     with pytest.raises(IndexError):
         code.get(4)

+
 def test_set():
     """test Wikicode.set()"""
     code = parse("Have a {{template}} and a [[page|link]]")
@@ -82,6 +86,7 @@ def test_set():
     with pytest.raises(IndexError):
         code.set(-4, "{{baz}}")

+
 def test_contains():
     """test Wikicode.contains()"""
     code = parse("Here is {{aaa|{{bbb|xyz{{ccc}}}}}} and a [[page|link]]")
@@ -93,6 +98,7 @@ def test_contains():
     assert code.contains(str(tmpl4)) is True
     assert code.contains(tmpl2.params[0].value) is True

+
 def test_index():
     """test Wikicode.index()"""
     code = parse("Have a {{template}} and a [[page|link]]")
@@ -105,13 +111,13 @@ def test_index():
     code = parse("{{foo}}{{bar|{{baz}}}}")
     assert 1 == code.index("{{bar|{{baz}}}}")
     assert 1 == code.index("{{baz}}", recursive=True)
-    assert 1 == code.index(code.get(1).get(1).value,
-                           recursive=True)
+    assert 1 == code.index(code.get(1).get(1).value, recursive=True)
     with pytest.raises(ValueError):
         code.index("{{baz}}", recursive=False)
     with pytest.raises(ValueError):
         code.index(code.get(1).get(1).value, recursive=False)

+
 def test_get_ancestors_parent():
     """test Wikicode.get_ancestors() and Wikicode.get_parent()"""
     code = parse("{{a|{{b|{{d|{{e}}{{f}}}}{{g}}}}}}{{c}}")
@@ -130,6 +136,7 @@ def test_get_ancestors_parent():
     with pytest.raises(ValueError):
         code.get_parent(fake)

+
 def test_insert():
     """test Wikicode.insert()"""
     code = parse("Have a {{template}} and a [[page|link]]")
@@ -144,14 +151,22 @@ def test_insert():
     code2 = parse("{{foo}}{{bar}}{{baz}}")
     code2.insert(1, "abc{{def}}ghi[[jk]]")
     assert "{{foo}}abc{{def}}ghi[[jk]]{{bar}}{{baz}}" == code2
-    assert ["{{foo}}", "abc", "{{def}}", "ghi", "[[jk]]",
-            "{{bar}}", "{{baz}}"] == code2.nodes
+    assert [
+        "{{foo}}",
+        "abc",
+        "{{def}}",
+        "ghi",
+        "[[jk]]",
+        "{{bar}}",
+        "{{baz}}",
+    ] == code2.nodes

     code3 = parse("{{foo}}bar")
     code3.insert(1000, "[[baz]]")
     code3.insert(-1000, "derp")
     assert "derp{{foo}}bar[[baz]]" == code3

+
 def _test_search(meth, expected):
     """Base test for insert_before(), insert_after(), and replace()."""
     code = parse("{{a}}{{b}}{{c}}{{d}}{{e}}")
@@ -249,6 +264,7 @@ def _test_search(meth, expected):
     meth(code9, code9.get_sections()[0], "{{quz}}")
     assert expected[8] == code9

+
 def test_insert_before():
     """test Wikicode.insert_before()"""
     meth = lambda code, *args, **kw: code.insert_before(*args, **kw)
@@ -265,6 +281,7 @@ def test_insert_before():
     ]
     _test_search(meth, expected)

+
 def test_insert_after():
     """test Wikicode.insert_after()"""
     meth = lambda code, *args, **kw: code.insert_after(*args, **kw)
@@ -281,6 +298,7 @@ def test_insert_after():
     ]
     _test_search(meth, expected)

+
 def test_replace():
     """test Wikicode.replace()"""
     meth = lambda code, *args, **kw: code.replace(*args, **kw)
@@ -297,6 +315,7 @@ def test_replace():
     ]
     _test_search(meth, expected)

+
 def test_append():
     """test Wikicode.append()"""
     code = parse("Have a {{template}}")
@@ -310,6 +329,7 @@ def test_append():
     with pytest.raises(ValueError):
         code.append(slice(0, 1))

+
 def test_remove():
     """test Wikicode.remove()"""
     meth = lambda code, obj, value, **kw: code.remove(obj, **kw)
@@ -326,6 +346,7 @@ def test_remove():
     ]
     _test_search(meth, expected)

+
 def test_matches():
     """test Wikicode.matches()"""
     code1 = parse("Cleanup")
@@ -357,17 +378,32 @@ def test_matches():
     assert code5.matches("<!-- nothing -->") is True
     assert code5.matches(("a", "b", "")) is True

+
 def test_filter_family():
     """test the Wikicode.i?filter() family of functions"""
+
     def genlist(gen):
         assert isinstance(gen, GeneratorType)
         return list(gen)
+
     ifilter = lambda code: (lambda *a, **k: genlist(code.ifilter(*a, **k)))

     code = parse("a{{b}}c[[d]]{{{e}}}{{f}}[[g]]")
     for func in (code.filter, ifilter(code)):
-        assert ["a", "{{b}}", "b", "c", "[[d]]", "d", "{{{e}}}",
-                "e", "{{f}}", "f", "[[g]]", "g"] == func()
+        assert [
+            "a",
+            "{{b}}",
+            "b",
+            "c",
+            "[[d]]",
+            "d",
+            "{{{e}}}",
+            "e",
+            "{{f}}",
+            "f",
+            "[[g]]",
+            "g",
+        ] == func()
         assert ["{{{e}}}"] == func(forcetype=Argument)
         assert code.get(4) is func(forcetype=Argument)[0]
         assert list("abcdefg") == func(forcetype=Text)
@@ -377,7 +413,7 @@ def test_filter_family():

     funcs = [
         lambda name, **kw: getattr(code, "filter_" + name)(**kw),
-        lambda name, **kw: genlist(getattr(code, "ifilter_" + name)(**kw))
+        lambda name, **kw: genlist(getattr(code, "ifilter_" + name)(**kw)),
     ]
     for get_filter in funcs:
         assert ["{{{e}}}"] == get_filter("arguments")
@@ -393,27 +429,35 @@ def test_filter_family():

     code2 = parse("{{a|{{b}}|{{c|d={{f}}{{h}}}}}}")
     for func in (code2.filter, ifilter(code2)):
-        assert ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}"] \
-            == func(recursive=False, forcetype=Template)
-        assert ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}", "{{b}}",
-                "{{c|d={{f}}{{h}}}}", "{{f}}", "{{h}}"] \
-            == func(recursive=True, forcetype=Template)
+        assert ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}"] == func(
+            recursive=False, forcetype=Template
+        )
+        assert [
+            "{{a|{{b}}|{{c|d={{f}}{{h}}}}}}",
+            "{{b}}",
+            "{{c|d={{f}}{{h}}}}",
+            "{{f}}",
+            "{{h}}",
+        ] == func(recursive=True, forcetype=Template)

     code3 = parse("{{foobar}}{{FOO}}{{baz}}{{bz}}{{barfoo}}")
     for func in (code3.filter, ifilter(code3)):
-        assert ["{{foobar}}", "{{barfoo}}"] \
-            == func(False, matches=lambda node: "foo" in node)
-        assert ["{{foobar}}", "{{FOO}}", "{{barfoo}}"] \
-            == func(False, matches=r"foo")
-        assert ["{{foobar}}", "{{FOO}}"] \
-            == func(matches=r"^{{foo.*?}}")
-        assert ["{{foobar}}"] \
-            == func(matches=r"^{{foo.*?}}", flags=re.UNICODE)
+        assert ["{{foobar}}", "{{barfoo}}"] == func(
+            False, matches=lambda node: "foo" in node
+        )
+        assert ["{{foobar}}", "{{FOO}}", "{{barfoo}}"] == func(False, matches=r"foo")
+        assert ["{{foobar}}", "{{FOO}}"] == func(matches=r"^{{foo.*?}}")
+        assert ["{{foobar}}"] == func(matches=r"^{{foo.*?}}", flags=re.UNICODE)
         assert ["{{baz}}", "{{bz}}"] == func(matches=r"^{{b.*?z")
         assert ["{{baz}}"] == func(matches=r"^{{b.+?z}}")

-    exp_rec = ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}", "{{b}}",
-               "{{c|d={{f}}{{h}}}}", "{{f}}", "{{h}}"]
+    exp_rec = [
+        "{{a|{{b}}|{{c|d={{f}}{{h}}}}}}",
+        "{{b}}",
+        "{{c|d={{f}}{{h}}}}",
+        "{{f}}",
+        "{{h}}",
+    ]
     exp_unrec = ["{{a|{{b}}|{{c|d={{f}}{{h}}}}}}"]
     assert exp_rec == code2.filter_templates()
     assert exp_unrec == code2.filter_templates(recursive=False)
@@ -422,9 +466,9 @@ def test_filter_family():
     assert exp_unrec == code2.filter_templates(False)

     assert ["{{foobar}}"] == code3.filter_templates(
-        matches=lambda node: node.name.matches("Foobar"))
-    assert ["{{baz}}", "{{bz}}"] \
-        == code3.filter_templates(matches=r"^{{b.*?z")
+        matches=lambda node: node.name.matches("Foobar")
+    )
+    assert ["{{baz}}", "{{bz}}"] == code3.filter_templates(matches=r"^{{b.*?z")
     assert [] == code3.filter_tags(matches=r"^{{b.*?z")
     assert [] == code3.filter_tags(matches=r"^{{b.*?z", flags=0)
     with pytest.raises(TypeError):
@@ -440,6 +484,7 @@ def test_filter_family():
     assert ["{{foo}}", "{{foo|{{bar}}}}"] == actual1
     assert ["{{foo}}", "{{foo|{{bar}}}}"] == actual2

+
 def test_get_sections():
     """test Wikicode.get_sections()"""
     page1 = parse("")
@@ -461,44 +506,70 @@ def test_get_sections():

     assert [""] == page1.get_sections()
     assert ["", "==Heading=="] == page2.get_sections()
-    assert ["", "===Heading===\nFoo bar baz\n====Gnidaeh====\n", "====Gnidaeh====\n"] \
-        == page3.get_sections()
-    assert [p4_lead, p4_I, p4_IA, p4_IB, p4_IB1, p4_II,
-            p4_III, p4_IIIA, p4_IIIA1a, p4_IIIA2, p4_IIIA2ai1] \
-        == page4.get_sections()
+    assert [
+        "",
+        "===Heading===\nFoo bar baz\n====Gnidaeh====\n",
+        "====Gnidaeh====\n",
+    ] == page3.get_sections()
+    assert [
+        p4_lead,
+        p4_I,
+        p4_IA,
+        p4_IB,
+        p4_IB1,
+        p4_II,
+        p4_III,
+        p4_IIIA,
+        p4_IIIA1a,
+        p4_IIIA2,
+        p4_IIIA2ai1,
+    ] == page4.get_sections()

     assert ["====Gnidaeh====\n"] == page3.get_sections(levels=[4])
-    assert ["===Heading===\nFoo bar baz\n====Gnidaeh====\n"] \
-        == page3.get_sections(levels=(2, 3))
-    assert ["===Heading===\nFoo bar baz\n"] \
-        == page3.get_sections(levels=(2, 3), flat=True)
+    assert ["===Heading===\nFoo bar baz\n====Gnidaeh====\n"] == page3.get_sections(
+        levels=(2, 3)
+    )
+    assert ["===Heading===\nFoo bar baz\n"] == page3.get_sections(
+        levels=(2, 3), flat=True
+    )
     assert [] == page3.get_sections(levels=[0])
-    assert ["", "====Gnidaeh====\n"] == page3.get_sections(levels=[4], include_lead=True)
-    assert ["===Heading===\nFoo bar baz\n====Gnidaeh====\n",
-            "====Gnidaeh====\n"] == page3.get_sections(include_lead=False)
-    assert ["===Heading===\nFoo bar baz\n", "====Gnidaeh====\n"] \
-        == page3.get_sections(flat=True, include_lead=False)
+    assert ["", "====Gnidaeh====\n"] == page3.get_sections(
+        levels=[4], include_lead=True
+    )
+    assert [
+        "===Heading===\nFoo bar baz\n====Gnidaeh====\n",
+        "====Gnidaeh====\n",
+    ] == page3.get_sections(include_lead=False)
+    assert ["===Heading===\nFoo bar baz\n", "====Gnidaeh====\n"] == page3.get_sections(
+        flat=True, include_lead=False
+    )

     assert [p4_IB1, p4_IIIA2] == page4.get_sections(levels=[4])
     assert [p4_IA, p4_IB, p4_IIIA] == page4.get_sections(levels=[3])
-    assert [p4_IA, "=== Section I.B ===\n",
-            "=== Section III.A ===\nText.\n"] \
-        == page4.get_sections(levels=[3], flat=True)
+    assert [
+        p4_IA,
+        "=== Section I.B ===\n",
+        "=== Section III.A ===\nText.\n",
+    ] == page4.get_sections(levels=[3], flat=True)
     assert ["", ""] == page2.get_sections(include_headings=False)
-    assert ["\nSection I.B.1 body.\n\n&bull;Some content.\n\n",
-            "\nEven more text.\n" + p4_IIIA2ai1] \
-        == page4.get_sections(levels=[4], include_headings=False)
+    assert [
+        "\nSection I.B.1 body.\n\n&bull;Some content.\n\n",
+        "\nEven more text.\n" + p4_IIIA2ai1,
+    ] == page4.get_sections(levels=[4], include_headings=False)

     assert [] == page4.get_sections(matches=r"body")
-    assert [p4_I, p4_IA, p4_IB, p4_IB1] \
-        == page4.get_sections(matches=r"Section\sI[.\s].*?")
-    assert [p4_IA, p4_IIIA, p4_IIIA1a, p4_IIIA2, p4_IIIA2ai1] \
-        == page4.get_sections(matches=r".*?a.*?")
-    assert [p4_IIIA1a, p4_IIIA2ai1] \
-        == page4.get_sections(matches=r".*?a.*?", flags=re.U)
-    assert ["\nMore text.\n", "\nAn invalid section!"] \
-        == page4.get_sections(matches=r".*?a.*?", flags=re.U,
-                              include_headings=False)
+    assert [p4_I, p4_IA, p4_IB, p4_IB1] == page4.get_sections(
+        matches=r"Section\sI[.\s].*?"
+    )
+    assert [p4_IA, p4_IIIA, p4_IIIA1a, p4_IIIA2, p4_IIIA2ai1] == page4.get_sections(
+        matches=r".*?a.*?"
+    )
+    assert [p4_IIIA1a, p4_IIIA2ai1] == page4.get_sections(
+        matches=r".*?a.*?", flags=re.U
+    )
+    assert ["\nMore text.\n", "\nAn invalid section!"] == page4.get_sections(
+        matches=r".*?a.*?", flags=re.U, include_headings=False
+    )

     sections = page2.get_sections(include_headings=False)
     sections[0].append("Lead!\n")
@@ -512,22 +583,22 @@ def test_get_sections():
     assert "== Foo ==\nBarf {{Haha}}\n" == section
     assert "X\n== Foo ==\nBarf {{Haha}}\n== Baz ==\nBuzz" == page5

+
 def test_strip_code():
     """test Wikicode.strip_code()"""
     # Since individual nodes have test cases for their __strip__ methods,
     # we're only going to do an integration test:
     code = parse("Foo [[bar]]\n\n{{baz|hello}}\n\n[[a|b]] &Sigma;")
-    assert "Foo bar\n\nb Σ" \
-        == code.strip_code(normalize=True, collapse=True)
-    assert "Foo bar\n\n\n\nb Σ" \
-        == code.strip_code(normalize=True, collapse=False)
-    assert "Foo bar\n\nb &Sigma;" \
-        == code.strip_code(normalize=False, collapse=True)
-    assert "Foo bar\n\n\n\nb &Sigma;" \
-        == code.strip_code(normalize=False, collapse=False)
-    assert "Foo bar\n\nhello\n\nb Σ" \
-        == code.strip_code(normalize=True, collapse=True,
-                           keep_template_params=True)
+    assert "Foo bar\n\nb Σ" == code.strip_code(normalize=True, collapse=True)
+    assert "Foo bar\n\n\n\nb Σ" == code.strip_code(normalize=True, collapse=False)
+    assert "Foo bar\n\nb &Sigma;" == code.strip_code(normalize=False, collapse=True)
+    assert "Foo bar\n\n\n\nb &Sigma;" == code.strip_code(
+        normalize=False, collapse=False
+    )
+    assert "Foo bar\n\nhello\n\nb Σ" == code.strip_code(
+        normalize=True, collapse=True, keep_template_params=True
+    )

+
 def test_get_tree():
     """test Wikicode.get_tree()"""
@@ -535,6 +606,8 @@ def test_get_tree():
     # methods, and the docstring covers all possibilities for the output of
     # __showtree__, we'll test it only:
     code = parse("Lorem ipsum {{foo|bar|{{baz}}|spam=eggs}}")
-    expected = "Lorem ipsum \n{{\n\t foo\n\t| 1\n\t= bar\n\t| 2\n\t= " + \
-               "{{\n\t\t\tbaz\n\t }}\n\t| spam\n\t= eggs\n}}"
+    expected = (
+        "Lorem ipsum \n{{\n\t foo\n\t| 1\n\t= bar\n\t| 2\n\t= "
+        + "{{\n\t\t\tbaz\n\t }}\n\t| spam\n\t= eggs\n}}"
+    )
     assert expected.expandtabs(4) == code.get_tree()
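A compact sketch of two of the Wikicode calls whose expected values were re-wrapped above, with inputs mirroring the tests themselves:

    from mwparserfromhell import parse

    code = parse("Foo [[bar]]\n\n{{baz|hello}}\n\n[[a|b]] &Sigma;")
    # Templates drop out, links keep their text, entities normalize to Unicode:
    assert code.strip_code(normalize=True, collapse=True) == "Foo bar\n\nb Σ"

    page = parse("===Heading===\nFoo bar baz\n====Gnidaeh====\n")
    print(page.get_sections(levels=[4]))  # -> ["====Gnidaeh====\n"]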

+16 -2   tests/test_wikilink.py

@@ -27,6 +27,7 @@ import pytest
 from mwparserfromhell.nodes import Text, Wikilink
 from .conftest import assert_wikicode_equal, wrap, wraptext

+
 def test_str():
     """test Wikilink.__str__()"""
     node = Wikilink(wraptext("foobar"))
@@ -34,6 +35,7 @@ def test_str():
     node2 = Wikilink(wraptext("foo"), wraptext("bar"))
     assert "[[foo|bar]]" == str(node2)

+
 def test_children():
     """test Wikilink.__children__()"""
     node1 = Wikilink(wraptext("foobar"))
@@ -48,6 +50,7 @@ def test_children():
     with pytest.raises(StopIteration):
         next(gen2)

+
 def test_strip():
     """test Wikilink.__strip__()"""
     node = Wikilink(wraptext("foobar"))
@@ -55,6 +58,7 @@ def test_strip():
     assert "foobar" == node.__strip__()
     assert "bar" == node2.__strip__()

+
 def test_showtree():
     """test Wikilink.__showtree__()"""
     output = []
@@ -66,10 +70,19 @@ def test_showtree():
     node1.__showtree__(output.append, get, mark)
     node2.__showtree__(output.append, get, mark)
     valid = [
-        "[[", (getter, node1.title), "]]", "[[", (getter, node2.title),
-        " | ", marker, (getter, node2.text), "]]"]
+        "[[",
+        (getter, node1.title),
+        "]]",
+        "[[",
+        (getter, node2.title),
+        " | ",
+        marker,
+        (getter, node2.text),
+        "]]",
+    ]
     assert valid == output

+
 def test_title():
     """test getter/setter for the title attribute"""
     title = wraptext("foobar")
@@ -82,6 +95,7 @@ def test_title():
     assert_wikicode_equal(wraptext("héhehé"), node1.title)
     assert_wikicode_equal(wraptext("héhehé"), node2.title)

+
 def test_text():
     """test getter/setter for the text attribute"""
     text = wraptext("baz")
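To close, a minimal sketch of the Wikilink behavior asserted above; building the title and text via parse_anything() is one safe way to get the Wikicode values the constructor expects:

    from mwparserfromhell.nodes import Wikilink
    from mwparserfromhell.utils import parse_anything

    node = Wikilink(parse_anything("foo"), parse_anything("bar"))
    assert str(node) == "[[foo|bar]]"

    node.title = "héhehé"  # the setter re-parses plain strings
    assert str(node.title) == "héhehé"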

