Browse Source

Autoformat: black + clang-format + clang-tidy

tags/v0.6.3
Ben Kurtovic 8 months ago
parent
commit
8cd0bdb322
68 changed files with 4288 additions and 2698 deletions
  1. +13
    -0
      .clang-format
  2. +1
    -0
      .gitignore
  3. +11
    -6
      scripts/memtest.py
  4. +32
    -23
      setup.py
  5. +1
    -2
      src/mwparserfromhell/__init__.py
  6. +16
    -4
      src/mwparserfromhell/definitions.py
  7. +12
    -2
      src/mwparserfromhell/nodes/__init__.py
  8. +2
    -0
      src/mwparserfromhell/nodes/_base.py
  9. +1
    -0
      src/mwparserfromhell/nodes/argument.py
  10. +1
    -0
      src/mwparserfromhell/nodes/comment.py
  11. +2
    -0
      src/mwparserfromhell/nodes/external_link.py
  12. +10
    -2
      src/mwparserfromhell/nodes/extras/attribute.py
  13. +2
    -2
      src/mwparserfromhell/nodes/extras/parameter.py
  14. +1
    -0
      src/mwparserfromhell/nodes/heading.py
  15. +12
    -5
      src/mwparserfromhell/nodes/html_entity.py
  16. +32
    -8
      src/mwparserfromhell/nodes/tag.py
  17. +7
    -6
      src/mwparserfromhell/nodes/template.py
  18. +1
    -0
      src/mwparserfromhell/nodes/text.py
  19. +1
    -0
      src/mwparserfromhell/nodes/wikilink.py
  20. +5
    -0
      src/mwparserfromhell/parser/__init__.py
  21. +57
    -20
      src/mwparserfromhell/parser/builder.py
  22. +59
    -34
      src/mwparserfromhell/parser/contexts.py
  23. +525
    -510
      src/mwparserfromhell/parser/ctokenizer/avl_tree.c
  24. +166
    -177
      src/mwparserfromhell/parser/ctokenizer/avl_tree.h
  25. +34
    -33
      src/mwparserfromhell/parser/ctokenizer/common.h
  26. +11
    -5
      src/mwparserfromhell/parser/ctokenizer/contexts.h
  27. +31
    -23
      src/mwparserfromhell/parser/ctokenizer/definitions.c
  28. +5
    -6
      src/mwparserfromhell/parser/ctokenizer/definitions.h
  29. +20
    -14
      src/mwparserfromhell/parser/ctokenizer/tag_data.c
  30. +6
    -6
      src/mwparserfromhell/parser/ctokenizer/tag_data.h
  31. +45
    -26
      src/mwparserfromhell/parser/ctokenizer/textbuffer.c
  32. +8
    -8
      src/mwparserfromhell/parser/ctokenizer/textbuffer.h
  33. +966
    -685
      src/mwparserfromhell/parser/ctokenizer/tok_parse.c
  34. +4
    -3
      src/mwparserfromhell/parser/ctokenizer/tok_parse.h
  35. +122
    -79
      src/mwparserfromhell/parser/ctokenizer/tok_support.c
  36. +28
    -31
      src/mwparserfromhell/parser/ctokenizer/tok_support.h
  37. +94
    -67
      src/mwparserfromhell/parser/ctokenizer/tokenizer.c
  38. +56
    -47
      src/mwparserfromhell/parser/ctokenizer/tokenizer.h
  39. +42
    -44
      src/mwparserfromhell/parser/ctokenizer/tokens.c
  40. +38
    -38
      src/mwparserfromhell/parser/ctokenizer/tokens.h
  41. +2
    -0
      src/mwparserfromhell/parser/errors.py
  42. +126
    -52
      src/mwparserfromhell/parser/tokenizer.py
  43. +30
    -28
      src/mwparserfromhell/parser/tokens.py
  44. +4
    -4
      src/mwparserfromhell/smart_list/list_proxy.py
  45. +5
    -2
      src/mwparserfromhell/string_mixin.py
  46. +5
    -2
      src/mwparserfromhell/utils.py
  47. +57
    -22
      src/mwparserfromhell/wikicode.py
  48. +22
    -2
      tests/conftest.py
  49. +16
    -2
      tests/test_argument.py
  50. +5
    -0
      tests/test_attribute.py
  51. +737
    -326
      tests/test_builder.py
  52. +5
    -0
      tests/test_comment.py
  53. +12
    -6
      tests/test_docs.py
  54. +12
    -7
      tests/test_external_link.py
  55. +7
    -2
      tests/test_heading.py
  56. +9
    -0
      tests/test_html_entity.py
  57. +4
    -0
      tests/test_parameter.py
  58. +43
    -19
      tests/test_parser.py
  59. +27
    -2
      tests/test_smart_list.py
  60. +95
    -20
      tests/test_string_mixin.py
  61. +99
    -40
      tests/test_tag.py
  62. +267
    -140
      tests/test_template.py
  63. +5
    -0
      tests/test_text.py
  64. +26
    -14
      tests/test_tokenizer.py
  65. +9
    -5
      tests/test_tokens.py
  66. +23
    -18
      tests/test_utils.py
  67. +140
    -67
      tests/test_wikicode.py
  68. +16
    -2
      tests/test_wikilink.py

+ 13
- 0
.clang-format View File

@@ -0,0 +1,13 @@
BasedOnStyle: LLVM
AlignConsecutiveMacros: AcrossEmptyLines
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakAfterReturnType: TopLevelDefinitions
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Linux
ColumnLimit: 88
IndentPPDirectives: AfterHash
IndentWidth: 4
SpaceAfterCStyleCast: true
StatementMacros:
- PyObject_HEAD

+ 1
- 0
.gitignore View File

@@ -13,5 +13,6 @@ dist
docs/_build
scripts/*.log
htmlcov/
compile_commands.json
.idea/
.pytest_cache/

+ 11
- 6
scripts/memtest.py View File

@@ -41,6 +41,7 @@ from mwparserfromhell.parser._tokenizer import CTokenizer

LOOPS = 10000


class Color:
GRAY = "\x1b[30;1m"
GREEN = "\x1b[92m"
@@ -63,11 +64,11 @@ class MemoryTest:
data = {"name": None, "label": None, "input": None, "output": None}
for line in test.strip().splitlines():
if line.startswith("name:"):
data["name"] = line[len("name:"):].strip()
data["name"] = line[len("name:") :].strip()
elif line.startswith("label:"):
data["label"] = line[len("label:"):].strip()
data["label"] = line[len("label:") :].strip()
elif line.startswith("input:"):
raw = line[len("input:"):].strip()
raw = line[len("input:") :].strip()
if raw[0] == '"' and raw[-1] == '"':
raw = raw[1:-1]
raw = raw.encode("raw_unicode_escape")
@@ -81,7 +82,7 @@ class MemoryTest:
def load_file(filename):
with open(filename, "rU") as fp:
text = fp.read()
name = path.split(filename)[1][:0-len(extension)]
name = path.split(filename)[1][: 0 - len(extension)]
self._parse_file(name, text)

root = path.split(path.dirname(path.abspath(__file__)))[0]
@@ -119,8 +120,11 @@ class MemoryTest:

tmpl = "{0}[{1:03}/{2}]{3} {4}: "
for i, (name, text) in enumerate(self._tests, 1):
sys.stdout.write(tmpl.format(Color.GRAY, i, len(self._tests),
Color.RESET, name.ljust(width)))
sys.stdout.write(
tmpl.format(
Color.GRAY, i, len(self._tests), Color.RESET, name.ljust(width)
)
)
sys.stdout.flush()
parent, child = Pipe()
p = Process(target=_runner, args=(text, child))
@@ -156,6 +160,7 @@ def _runner(text, child):
child.send("OK")
child.recv()


if __name__ == "__main__":
setlocale(LC_ALL, "")
MemoryTest().run()

+ 32
- 23
setup.py View File

@@ -52,8 +52,10 @@ elif env_var is not None:

# Remove the command line argument as it isn't understood by setuptools:

sys.argv = [arg for arg in sys.argv
if arg not in ("--without-extension", "--with-extension")]
sys.argv = [
arg for arg in sys.argv if arg not in ("--without-extension", "--with-extension")
]


def build_ext_patched(self):
try:
@@ -63,33 +65,40 @@ def build_ext_patched(self):
print("Falling back to pure Python mode.")
del self.extensions[:]


if fallback:
build_ext.run, build_ext_original = build_ext_patched, build_ext.run

# Project-specific part begins here:

tokenizer = Extension("mwparserfromhell.parser._tokenizer",
sources=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.c")),
depends=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.h")))
tokenizer = Extension(
"mwparserfromhell.parser._tokenizer",
sources=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.c")),
depends=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.h")),
)

setup(
name = "mwparserfromhell",
packages = find_packages("src"),
package_dir = {"": "src"},
ext_modules = [tokenizer] if use_extension else [],
setup_requires = ["pytest-runner"] if "test" in sys.argv or "pytest" in sys.argv else [],
tests_require = ["pytest"],
version = __version__,
python_requires = ">= 3.5",
author = "Ben Kurtovic",
author_email = "ben.kurtovic@gmail.com",
url = "https://github.com/earwig/mwparserfromhell",
description = "MWParserFromHell is a parser for MediaWiki wikicode.",
long_description = long_docs,
download_url = "https://github.com/earwig/mwparserfromhell/tarball/v{}".format(__version__),
keywords = "earwig mwparserfromhell wikipedia wiki mediawiki wikicode template parsing",
license = "MIT License",
classifiers = [
name="mwparserfromhell",
packages=find_packages("src"),
package_dir={"": "src"},
ext_modules=[tokenizer] if use_extension else [],
setup_requires=["pytest-runner"]
if "test" in sys.argv or "pytest" in sys.argv
else [],
tests_require=["pytest"],
version=__version__,
python_requires=">= 3.5",
author="Ben Kurtovic",
author_email="ben.kurtovic@gmail.com",
url="https://github.com/earwig/mwparserfromhell",
description="MWParserFromHell is a parser for MediaWiki wikicode.",
long_description=long_docs,
download_url="https://github.com/earwig/mwparserfromhell/tarball/v{}".format(
__version__
),
keywords="earwig mwparserfromhell wikipedia wiki mediawiki wikicode template parsing",
license="MIT License",
classifiers=[
"Development Status :: 4 - Beta",
"Environment :: Console",
"Intended Audience :: Developers",
@@ -101,6 +110,6 @@ setup(
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Topic :: Text Processing :: Markup"
"Topic :: Text Processing :: Markup",
],
)

+ 1
- 2
src/mwparserfromhell/__init__.py View File

@@ -30,7 +30,6 @@ __license__ = "MIT License"
__version__ = "0.7.dev0"
__email__ = "ben.kurtovic@gmail.com"

from . import (definitions, nodes, parser, smart_list, string_mixin,
utils, wikicode)
from . import definitions, nodes, parser, smart_list, string_mixin, utils, wikicode

parse = utils.parse_anything

+ 16
- 4
src/mwparserfromhell/definitions.py View File

@@ -26,8 +26,14 @@ When updating this file, please also update the C tokenizer version:
- mwparserfromhell/parser/ctokenizer/definitions.h
"""

__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
"is_single_only", "is_scheme"]
__all__ = [
"get_html_tag",
"is_parsable",
"is_visible",
"is_single",
"is_single_only",
"is_scheme",
]

URI_SCHEMES = {
# [wikimedia/mediawiki.git]/includes/DefaultSettings.php @ 5c660de5d0
@@ -92,7 +98,7 @@ INVISIBLE_TAGS = [
"score",
"section",
"templatedata",
"timeline"
"timeline",
]

# [wikimedia/mediawiki.git]/includes/parser/Sanitizer.php @ 95e17ee645
@@ -103,29 +109,35 @@ MARKUP_TO_HTML = {
"#": "li",
"*": "li",
";": "dt",
":": "dd"
":": "dd",
}


def get_html_tag(markup):
"""Return the HTML tag associated with the given wiki-markup."""
return MARKUP_TO_HTML[markup]


def is_parsable(tag):
"""Return if the given *tag*'s contents should be passed to the parser."""
return tag.lower() not in PARSER_BLACKLIST


def is_visible(tag):
"""Return whether or not the given *tag* contains visible text."""
return tag.lower() not in INVISIBLE_TAGS


def is_single(tag):
"""Return whether or not the given *tag* can exist without a close tag."""
return tag.lower() in SINGLE


def is_single_only(tag):
"""Return whether or not the given *tag* must exist without a close tag."""
return tag.lower() in SINGLE_ONLY


def is_scheme(scheme, slashes=True):
"""Return whether *scheme* is valid for external links."""
scheme = scheme.lower()


+ 12
- 2
src/mwparserfromhell/nodes/__init__.py View File

@@ -39,5 +39,15 @@ from .tag import Tag
from .template import Template
from .wikilink import Wikilink

__all__ = ["Argument", "Comment", "ExternalLink", "HTMLEntity", "Heading",
"Node", "Tag", "Template", "Text", "Wikilink"]
__all__ = [
"Argument",
"Comment",
"ExternalLink",
"HTMLEntity",
"Heading",
"Node",
"Tag",
"Template",
"Text",
"Wikilink",
]

+ 2
- 0
src/mwparserfromhell/nodes/_base.py View File

@@ -22,6 +22,7 @@ from ..string_mixin import StringMixIn

__all__ = ["Node"]


class Node(StringMixIn):
"""Represents the base Node type, demonstrating the methods to override.

@@ -35,6 +36,7 @@ class Node(StringMixIn):
:meth:`__showtree__` can be overridden to build a nice tree representation
of the node, if desired, for :meth:`~.Wikicode.get_tree`.
"""

def __str__(self):
raise NotImplementedError()



+ 1
- 0
src/mwparserfromhell/nodes/argument.py View File

@@ -24,6 +24,7 @@ from ..utils import parse_anything

__all__ = ["Argument"]


class Argument(Node):
"""Represents a template argument substitution, like ``{{{foo}}}``."""



+ 1
- 0
src/mwparserfromhell/nodes/comment.py View File

@@ -23,6 +23,7 @@ from ._base import Node

__all__ = ["Comment"]


class Comment(Node):
"""Represents a hidden HTML comment, like ``<!-- foobar -->``."""



+ 2
- 0
src/mwparserfromhell/nodes/external_link.py View File

@@ -24,6 +24,7 @@ from ..utils import parse_anything

__all__ = ["ExternalLink"]


class ExternalLink(Node):
"""Represents an external link, like ``[http://example.com/ Example]``."""

@@ -83,6 +84,7 @@ class ExternalLink(Node):
def url(self, value):
# pylint: disable=import-outside-toplevel
from ..parser import contexts

self._url = parse_anything(value, contexts.EXT_LINK_URI)

@title.setter


+ 10
- 2
src/mwparserfromhell/nodes/extras/attribute.py View File

@@ -24,6 +24,7 @@ from ...utils import parse_anything

__all__ = ["Attribute"]


class Attribute(StringMixIn):
"""Represents an attribute of an HTML tag.

@@ -32,8 +33,15 @@ class Attribute(StringMixIn):
whose value is ``"foo"``.
"""

def __init__(self, name, value=None, quotes='"', pad_first=" ",
pad_before_eq="", pad_after_eq=""):
def __init__(
self,
name,
value=None,
quotes='"',
pad_first=" ",
pad_before_eq="",
pad_after_eq="",
):
super().__init__()
self.name = name
self._quotes = None


+ 2
- 2
src/mwparserfromhell/nodes/extras/parameter.py View File

@@ -25,6 +25,7 @@ from ...utils import parse_anything

__all__ = ["Parameter"]


class Parameter(StringMixIn):
"""Represents a parameter of a template.

@@ -77,6 +78,5 @@ class Parameter(StringMixIn):
def showkey(self, newval):
newval = bool(newval)
if not newval and not self.can_hide_key(self.name):
raise ValueError("parameter key {!r} cannot be hidden".format(
self.name))
raise ValueError("parameter key {!r} cannot be hidden".format(self.name))
self._showkey = newval

+ 1
- 0
src/mwparserfromhell/nodes/heading.py View File

@@ -24,6 +24,7 @@ from ..utils import parse_anything

__all__ = ["Heading"]


class Heading(Node):
"""Represents a section heading in wikicode, like ``== Foo ==``."""



+ 12
- 5
src/mwparserfromhell/nodes/html_entity.py View File

@@ -24,6 +24,7 @@ from ._base import Node

__all__ = ["HTMLEntity"]


class HTMLEntity(Node):
"""Represents an HTML entity, like ``&nbsp;``, either named or unnamed."""

@@ -101,19 +102,23 @@ class HTMLEntity(Node):
except ValueError:
if newval not in htmlentities.entitydefs:
raise ValueError(
"entity value {!r} is not a valid name".format(newval)) from None
"entity value {!r} is not a valid name".format(newval)
) from None
self._named = True
self._hexadecimal = False
else:
if intval < 0 or intval > 0x10FFFF:
raise ValueError(
"entity value 0x{:x} is not in range(0x110000)".format(intval)) from None
"entity value 0x{:x} is not in range(0x110000)".format(intval)
) from None
self._named = False
self._hexadecimal = True
else:
test = int(newval, 16 if self.hexadecimal else 10)
if test < 0 or test > 0x10FFFF:
raise ValueError("entity value {} is not in range(0x110000)".format(test))
raise ValueError(
"entity value {} is not in range(0x110000)".format(test)
)
self._named = False
self._value = newval

@@ -126,8 +131,10 @@ class HTMLEntity(Node):
try:
int(self.value, 16)
except ValueError as exc:
raise ValueError("current entity value {!r} is not a valid "
"Unicode codepoint".format(self.value)) from exc
raise ValueError(
"current entity value {!r} is not a valid "
"Unicode codepoint".format(self.value)
) from exc
self._named = newval

@hexadecimal.setter


+ 32
- 8
src/mwparserfromhell/nodes/tag.py View File

@@ -26,13 +26,24 @@ from ..utils import parse_anything

__all__ = ["Tag"]


class Tag(Node):
"""Represents an HTML-style tag in wikicode, like ``<ref>``."""

def __init__(self, tag, contents=None, attrs=None, wiki_markup=None,
self_closing=False, invalid=False, implicit=False, padding="",
closing_tag=None, wiki_style_separator=None,
closing_wiki_markup=None):
def __init__(
self,
tag,
contents=None,
attrs=None,
wiki_markup=None,
self_closing=False,
invalid=False,
implicit=False,
padding="",
closing_tag=None,
wiki_style_separator=None,
closing_wiki_markup=None,
):
super().__init__()
self.tag = tag
self.contents = contents
@@ -60,8 +71,14 @@ class Tag(Node):
if self.self_closing:
return self.wiki_markup + attrs + padding + separator
close = self.closing_wiki_markup or ""
return self.wiki_markup + attrs + padding + separator + \
str(self.contents) + close
return (
self.wiki_markup
+ attrs
+ padding
+ separator
+ str(self.contents)
+ close
)

result = ("</" if self.invalid else "<") + str(self.tag)
if self.attributes:
@@ -270,8 +287,15 @@ class Tag(Node):
return attr
raise ValueError(name)

def add(self, name, value=None, quotes='"', pad_first=" ",
pad_before_eq="", pad_after_eq=""):
def add(
self,
name,
value=None,
quotes='"',
pad_first=" ",
pad_before_eq="",
pad_after_eq="",
):
"""Add an attribute with the given *name* and *value*.

*name* and *value* can be anything parsable by


+ 7
- 6
src/mwparserfromhell/nodes/template.py View File

@@ -33,6 +33,7 @@ FLAGS = re.DOTALL | re.UNICODE
# Used to allow None as a valid fallback value
_UNSET = object()


class Template(Node):
"""Represents a template in wikicode, like ``{{foo}}``."""

@@ -153,7 +154,7 @@ class Template(Node):
def _fix_dependendent_params(self, i):
"""Unhide keys if necessary after removing the param at index *i*."""
if not self.params[i].showkey:
for param in self.params[i + 1:]:
for param in self.params[i + 1 :]:
if not param.showkey:
param.showkey = True

@@ -175,9 +176,10 @@ class Template(Node):
If one exists, we should remove the given one rather than blanking it.
"""
if self.params[i].showkey:
following = self.params[i + 1:]
better_matches = [after.name.strip() == name and not after.showkey
for after in following]
following = self.params[i + 1 :]
better_matches = [
after.name.strip() == name and not after.showkey for after in following
]
return any(better_matches)
return False

@@ -235,8 +237,7 @@ class Template(Node):
def __getitem__(self, name):
return self.get(name)

def add(self, name, value, showkey=None, before=None,
preserve_spacing=True):
def add(self, name, value, showkey=None, before=None, preserve_spacing=True):
"""Add a parameter to the template with a given *name* and *value*.

*name* and *value* can be anything parsable by


+ 1
- 0
src/mwparserfromhell/nodes/text.py View File

@@ -23,6 +23,7 @@ from ._base import Node

__all__ = ["Text"]


class Text(Node):
"""Represents ordinary, unformatted text with no special properties."""



+ 1
- 0
src/mwparserfromhell/nodes/wikilink.py View File

@@ -24,6 +24,7 @@ from ..utils import parse_anything

__all__ = ["Wikilink"]


class Wikilink(Node):
"""Represents an internal wikilink, like ``[[Foo|Bar]]``."""



+ 5
- 0
src/mwparserfromhell/parser/__init__.py View File

@@ -26,16 +26,20 @@ together into one interface.

from .builder import Builder
from .errors import ParserError

try:
from ._tokenizer import CTokenizer

use_c = True
except ImportError:
from .tokenizer import Tokenizer

CTokenizer = None
use_c = False

__all__ = ["use_c", "Parser", "ParserError"]


class Parser:
"""Represents a parser for wikicode.

@@ -57,6 +61,7 @@ class Parser:
self._tokenizer = CTokenizer()
else:
from .tokenizer import Tokenizer

self._tokenizer = Tokenizer()
self._builder = Builder()



+ 57
- 20
src/mwparserfromhell/parser/builder.py View File

@@ -21,24 +21,34 @@

from . import tokens
from .errors import ParserError
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
Template, Text, Wikilink)
from ..nodes import (
Argument,
Comment,
ExternalLink,
Heading,
HTMLEntity,
Tag,
Template,
Text,
Wikilink,
)
from ..nodes.extras import Attribute, Parameter
from ..smart_list import SmartList
from ..wikicode import Wikicode

__all__ = ["Builder"]

_HANDLERS = {
tokens.Text: lambda self, token: Text(token.text)
}
_HANDLERS = {tokens.Text: lambda self, token: Text(token.text)}


def _add_handler(token_type):
"""Create a decorator that adds a handler function to the lookup table."""

def decorator(func):
"""Add a handler function to the lookup table."""
_HANDLERS[token_type] = func
return func

return decorator


@@ -84,8 +94,9 @@ class Builder:
key = self._pop()
showkey = True
self._push()
elif isinstance(token, (tokens.TemplateParamSeparator,
tokens.TemplateClose)):
elif isinstance(
token, (tokens.TemplateParamSeparator, tokens.TemplateClose)
):
self._tokens.append(token)
value = self._pop()
if key is None:
@@ -167,10 +178,17 @@ class Builder:
self._push()
elif isinstance(token, tokens.ExternalLinkClose):
if url is not None:
return ExternalLink(url, self._pop(), brackets=brackets,
suppress_space=suppress_space is True)
return ExternalLink(self._pop(), brackets=brackets,
suppress_space=suppress_space is True)
return ExternalLink(
url,
self._pop(),
brackets=brackets,
suppress_space=suppress_space is True,
)
return ExternalLink(
self._pop(),
brackets=brackets,
suppress_space=suppress_space is True,
)
else:
self._write(self._handle_token(token))
raise ParserError("_handle_external_link() missed a close token")
@@ -184,8 +202,9 @@ class Builder:
if isinstance(token, tokens.HTMLEntityHex):
text = self._tokens.pop()
self._tokens.pop() # Remove HTMLEntityEnd
return HTMLEntity(text.text, named=False, hexadecimal=True,
hex_char=token.char)
return HTMLEntity(
text.text, named=False, hexadecimal=True, hex_char=token.char
)
self._tokens.pop() # Remove HTMLEntityEnd
return HTMLEntity(token.text, named=False, hexadecimal=False)
self._tokens.pop() # Remove HTMLEntityEnd
@@ -227,15 +246,23 @@ class Builder:
self._push()
elif isinstance(token, tokens.TagAttrQuote):
quotes = token.char
elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen,
tokens.TagCloseSelfclose)):
elif isinstance(
token,
(tokens.TagAttrStart, tokens.TagCloseOpen, tokens.TagCloseSelfclose),
):
self._tokens.append(token)
if name:
value = self._pop()
else:
name, value = self._pop(), None
return Attribute(name, value, quotes, start.pad_first,
start.pad_before_eq, start.pad_after_eq)
return Attribute(
name,
value,
quotes,
start.pad_first,
start.pad_before_eq,
start.pad_after_eq,
)
else:
self._write(self._handle_token(token))
raise ParserError("_handle_attribute() missed a close token")
@@ -271,9 +298,19 @@ class Builder:
else:
self_closing = False
closing_tag = self._pop()
return Tag(tag, contents, attrs, wiki_markup, self_closing,
invalid, implicit, padding, closing_tag,
wiki_style_separator, closing_wiki_markup)
return Tag(
tag,
contents,
attrs,
wiki_markup,
self_closing,
invalid,
implicit,
padding,
closing_tag,
wiki_style_separator,
closing_wiki_markup,
)
else:
self._write(self._handle_token(token))
raise ParserError("_handle_tag() missed a close token")


+ 59
- 34
src/mwparserfromhell/parser/contexts.py View File

@@ -116,21 +116,21 @@ Aggregate contexts:

# Local contexts:

TEMPLATE_NAME = 1 << 0
TEMPLATE_PARAM_KEY = 1 << 1
TEMPLATE_NAME = 1 << 0
TEMPLATE_PARAM_KEY = 1 << 1
TEMPLATE_PARAM_VALUE = 1 << 2
TEMPLATE = TEMPLATE_NAME + TEMPLATE_PARAM_KEY + TEMPLATE_PARAM_VALUE

ARGUMENT_NAME = 1 << 3
ARGUMENT_NAME = 1 << 3
ARGUMENT_DEFAULT = 1 << 4
ARGUMENT = ARGUMENT_NAME + ARGUMENT_DEFAULT

WIKILINK_TITLE = 1 << 5
WIKILINK_TEXT = 1 << 6
WIKILINK_TEXT = 1 << 6
WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT

EXT_LINK_URI = 1 << 7
EXT_LINK_TITLE = 1 << 8
EXT_LINK_URI = 1 << 7
EXT_LINK_TITLE = 1 << 8
EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE

HEADING_LEVEL_1 = 1 << 9
@@ -139,42 +139,61 @@ HEADING_LEVEL_3 = 1 << 11
HEADING_LEVEL_4 = 1 << 12
HEADING_LEVEL_5 = 1 << 13
HEADING_LEVEL_6 = 1 << 14
HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 +
HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6)

TAG_OPEN = 1 << 15
TAG_ATTR = 1 << 16
TAG_BODY = 1 << 17
HEADING = (
HEADING_LEVEL_1
+ HEADING_LEVEL_2
+ HEADING_LEVEL_3
+ HEADING_LEVEL_4
+ HEADING_LEVEL_5
+ HEADING_LEVEL_6
)

TAG_OPEN = 1 << 15
TAG_ATTR = 1 << 16
TAG_BODY = 1 << 17
TAG_CLOSE = 1 << 18
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE

STYLE_ITALICS = 1 << 19
STYLE_BOLD = 1 << 20
STYLE_PASS_AGAIN = 1 << 21
STYLE_SECOND_PASS = 1 << 22
STYLE_ITALICS = 1 << 19
STYLE_BOLD = 1 << 20
STYLE_PASS_AGAIN = 1 << 21
STYLE_SECOND_PASS = 1 << 22
STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS

DL_TERM = 1 << 23

HAS_TEXT = 1 << 24
FAIL_ON_TEXT = 1 << 25
FAIL_NEXT = 1 << 26
HAS_TEXT = 1 << 24
FAIL_ON_TEXT = 1 << 25
FAIL_NEXT = 1 << 26
FAIL_ON_LBRACE = 1 << 27
FAIL_ON_RBRACE = 1 << 28
FAIL_ON_EQUALS = 1 << 29
HAS_TEMPLATE = 1 << 30
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE)

TABLE_OPEN = 1 << 31
TABLE_CELL_OPEN = 1 << 32
HAS_TEMPLATE = 1 << 30
SAFETY_CHECK = (
HAS_TEXT
+ FAIL_ON_TEXT
+ FAIL_NEXT
+ FAIL_ON_LBRACE
+ FAIL_ON_RBRACE
+ FAIL_ON_EQUALS
+ HAS_TEMPLATE
)

TABLE_OPEN = 1 << 31
TABLE_CELL_OPEN = 1 << 32
TABLE_CELL_STYLE = 1 << 33
TABLE_ROW_OPEN = 1 << 34
TABLE_TD_LINE = 1 << 35
TABLE_TH_LINE = 1 << 36
TABLE_ROW_OPEN = 1 << 34
TABLE_TD_LINE = 1 << 35
TABLE_TH_LINE = 1 << 36
TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
TABLE_TD_LINE + TABLE_TH_LINE)
TABLE = (
TABLE_OPEN
+ TABLE_CELL_OPEN
+ TABLE_CELL_STYLE
+ TABLE_ROW_OPEN
+ TABLE_TD_LINE
+ TABLE_TH_LINE
)

HTML_ENTITY = 1 << 37

@@ -184,14 +203,20 @@ GL_HEADING = 1 << 0

# Aggregate contexts:

FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG +
STYLE + TABLE)
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE
UNSAFE = (
TEMPLATE_NAME
+ WIKILINK_TITLE
+ EXT_LINK_TITLE
+ TEMPLATE_PARAM_KEY
+ ARGUMENT_NAME
+ TAG_CLOSE
)
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN
NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI
NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK


def describe(context):
"""Return a string describing the given context value, for debugging."""
flags = []


+ 525
- 510
src/mwparserfromhell/parser/ctokenizer/avl_tree.c
File diff suppressed because it is too large
View File


+ 166
- 177
src/mwparserfromhell/parser/ctokenizer/avl_tree.h View File

@@ -1,6 +1,6 @@
/*
* avl_tree.h - intrusive, nonrecursive AVL tree data structure (self-balancing
* binary search tree), header file
* binary search tree), header file
*
* Written in 2014-2016 by Eric Biggers <ebiggers3@gmail.com>
* Slight changes for compatibility by Ben Kurtovic <ben.kurtovic@gmail.com>
@@ -24,60 +24,60 @@
#include <stddef.h>

#if !defined(_MSC_VER) || (_MSC_VER >= 1600)
#include <stdint.h>
# include <stdint.h>
#endif

#ifdef __GNUC__
# define AVL_INLINE inline __attribute__((always_inline))
# define AVL_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER) && (_MSC_VER < 1900)
# define AVL_INLINE __inline
# define AVL_INLINE __inline
#else
# define AVL_INLINE inline
# define AVL_INLINE inline
#endif

/* Node in an AVL tree. Embed this in some other data structure. */
struct avl_tree_node {

/* Pointer to left child or NULL */
struct avl_tree_node *left;
/* Pointer to left child or NULL */
struct avl_tree_node *left;

/* Pointer to right child or NULL */
struct avl_tree_node *right;
/* Pointer to right child or NULL */
struct avl_tree_node *right;

/* Pointer to parent combined with the balance factor. This saves 4 or
* 8 bytes of memory depending on the CPU architecture.
*
* Low 2 bits: One greater than the balance factor of this subtree,
* which is equal to height(right) - height(left). The mapping is:
*
* 00 => -1
* 01 => 0
* 10 => +1
* 11 => undefined
*
* The rest of the bits are the pointer to the parent node. It must be
* 4-byte aligned, and it will be NULL if this is the root node and
* therefore has no parent. */
uintptr_t parent_balance;
/* Pointer to parent combined with the balance factor. This saves 4 or
* 8 bytes of memory depending on the CPU architecture.
*
* Low 2 bits: One greater than the balance factor of this subtree,
* which is equal to height(right) - height(left). The mapping is:
*
* 00 => -1
* 01 => 0
* 10 => +1
* 11 => undefined
*
* The rest of the bits are the pointer to the parent node. It must be
* 4-byte aligned, and it will be NULL if this is the root node and
* therefore has no parent. */
uintptr_t parent_balance;
};

/* Cast an AVL tree node to the containing data structure. */
#define avl_tree_entry(entry, type, member) \
((type*) ((char *)(entry) - offsetof(type, member)))
#define avl_tree_entry(entry, type, member) \
((type *) ((char *) (entry) -offsetof(type, member)))

/* Returns a pointer to the parent of the specified AVL tree node, or NULL if it
* is already the root of the tree. */
static AVL_INLINE struct avl_tree_node *
avl_get_parent(const struct avl_tree_node *node)
{
return (struct avl_tree_node *)(node->parent_balance & ~3);
return (struct avl_tree_node *) (node->parent_balance & ~3);
}

/* Marks the specified AVL tree node as unlinked from any tree. */
static AVL_INLINE void
avl_tree_node_set_unlinked(struct avl_tree_node *node)
{
node->parent_balance = (uintptr_t)node;
node->parent_balance = (uintptr_t) node;
}

/* Returns true iff the specified AVL tree node has been marked with
@@ -86,30 +86,29 @@ avl_tree_node_set_unlinked(struct avl_tree_node *node)
static AVL_INLINE int
avl_tree_node_is_unlinked(const struct avl_tree_node *node)
{
return node->parent_balance == (uintptr_t)node;
return node->parent_balance == (uintptr_t) node;
}

/* (Internal use only) */
extern void
avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr,
struct avl_tree_node *inserted);
extern void avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr,
struct avl_tree_node *inserted);

/*
* Looks up an item in the specified AVL tree.
*
* @root
* Pointer to the root of the AVL tree. (This can be NULL --- that just
* means the tree is empty.)
* Pointer to the root of the AVL tree. (This can be NULL --- that just
* means the tree is empty.)
*
* @cmp_ctx
* First argument to pass to the comparison callback. This generally
* should be a pointer to an object equal to the one being searched for.
* First argument to pass to the comparison callback. This generally
* should be a pointer to an object equal to the one being searched for.
*
* @cmp
* Comparison callback. Must return < 0, 0, or > 0 if the first argument
* is less than, equal to, or greater than the second argument,
* respectively. The first argument will be @cmp_ctx and the second
* argument will be a pointer to the AVL tree node of an item in the tree.
* Comparison callback. Must return < 0, 0, or > 0 if the first argument
* is less than, equal to, or greater than the second argument,
* respectively. The first argument will be @cmp_ctx and the second
* argument will be a pointer to the AVL tree node of an item in the tree.
*
* Returns a pointer to the AVL tree node of the resulting item, or NULL if the
* item was not found.
@@ -117,48 +116,49 @@ avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr,
* Example:
*
* struct int_wrapper {
* int data;
* struct avl_tree_node index_node;
* int data;
* struct avl_tree_node index_node;
* };
*
* static int _avl_cmp_int_to_node(const void *intptr,
* const struct avl_tree_node *nodeptr)
* const struct avl_tree_node *nodeptr)
* {
* int n1 = *(const int *)intptr;
* int n2 = avl_tree_entry(nodeptr, struct int_wrapper, index_node)->data;
* if (n1 < n2)
* return -1;
* else if (n1 > n2)
* return 1;
* else
* return 0;
* int n1 = *(const int *)intptr;
* int n2 = avl_tree_entry(nodeptr, struct int_wrapper, index_node)->data;
* if (n1 < n2)
* return -1;
* else if (n1 > n2)
* return 1;
* else
* return 0;
* }
*
* bool contains_int(struct avl_tree_node *root, int n)
* {
* struct avl_tree_node *result;
* struct avl_tree_node *result;
*
* result = avl_tree_lookup(root, &n, _avl_cmp_int_to_node);
* return result ? true : false;
* result = avl_tree_lookup(root, &n, _avl_cmp_int_to_node);
* return result ? true : false;
* }
*/
static AVL_INLINE struct avl_tree_node *
avl_tree_lookup(const struct avl_tree_node *root,
const void *cmp_ctx,
int (*cmp)(const void *, const struct avl_tree_node *))
const void *cmp_ctx,
int (*cmp)(const void *, const struct avl_tree_node *))
{
const struct avl_tree_node *cur = root;
const struct avl_tree_node *cur = root;

while (cur) {
int res = (*cmp)(cmp_ctx, cur);
if (res < 0)
cur = cur->left;
else if (res > 0)
cur = cur->right;
else
break;
}
return (struct avl_tree_node*)cur;
while (cur) {
int res = (*cmp)(cmp_ctx, cur);
if (res < 0) {
cur = cur->left;
} else if (res > 0) {
cur = cur->right;
} else {
break;
}
}
return (struct avl_tree_node *) cur;
}

/* Same as avl_tree_lookup(), but uses a more specific type for the comparison
@@ -167,44 +167,45 @@ avl_tree_lookup(const struct avl_tree_node *root,
* embedded 'struct avl_tree_node'. */
static AVL_INLINE struct avl_tree_node *
avl_tree_lookup_node(const struct avl_tree_node *root,
const struct avl_tree_node *node,
int (*cmp)(const struct avl_tree_node *,
const struct avl_tree_node *))
const struct avl_tree_node *node,
int (*cmp)(const struct avl_tree_node *,
const struct avl_tree_node *))
{
const struct avl_tree_node *cur = root;
const struct avl_tree_node *cur = root;

while (cur) {
int res = (*cmp)(node, cur);
if (res < 0)
cur = cur->left;
else if (res > 0)
cur = cur->right;
else
break;
}
return (struct avl_tree_node*)cur;
while (cur) {
int res = (*cmp)(node, cur);
if (res < 0) {
cur = cur->left;
} else if (res > 0) {
cur = cur->right;
} else {
break;
}
}
return (struct avl_tree_node *) cur;
}

/*
* Inserts an item into the specified AVL tree.
*
* @root_ptr
* Location of the AVL tree's root pointer. Indirection is needed because
* the root node may change as a result of rotations caused by the
* insertion. Initialize *root_ptr to NULL for an empty tree.
* Location of the AVL tree's root pointer. Indirection is needed because
* the root node may change as a result of rotations caused by the
* insertion. Initialize *root_ptr to NULL for an empty tree.
*
* @item
* Pointer to the `struct avl_tree_node' embedded in the item to insert.
* No members in it need be pre-initialized, although members in the
* containing structure should be pre-initialized so that @cmp can use them
* in comparisons.
* Pointer to the `struct avl_tree_node' embedded in the item to insert.
* No members in it need be pre-initialized, although members in the
* containing structure should be pre-initialized so that @cmp can use them
* in comparisons.
*
* @cmp
* Comparison callback. Must return < 0, 0, or > 0 if the first argument
* is less than, equal to, or greater than the second argument,
* respectively. The first argument will be @item and the second
* argument will be a pointer to an AVL tree node embedded in some
* previously-inserted item to which @item is being compared.
* Comparison callback. Must return < 0, 0, or > 0 if the first argument
* is less than, equal to, or greater than the second argument,
* respectively. The first argument will be @item and the second
* argument will be a pointer to an AVL tree node embedded in some
* previously-inserted item to which @item is being compared.
*
* If no item in the tree is comparatively equal (via @cmp) to @item, inserts
* @item and returns NULL. Otherwise does nothing and returns a pointer to the
@@ -214,150 +215,138 @@ avl_tree_lookup_node(const struct avl_tree_node *root,
* Example:
*
* struct int_wrapper {
* int data;
* struct avl_tree_node index_node;
* int data;
* struct avl_tree_node index_node;
* };
*
* #define GET_DATA(i) avl_tree_entry((i), struct int_wrapper, index_node)->data
*
* static int _avl_cmp_ints(const struct avl_tree_node *node1,
* const struct avl_tree_node *node2)
* const struct avl_tree_node *node2)
* {
* int n1 = GET_DATA(node1);
* int n2 = GET_DATA(node2);
* if (n1 < n2)
* return -1;
* else if (n1 > n2)
* return 1;
* else
* return 0;
* int n1 = GET_DATA(node1);
* int n2 = GET_DATA(node2);
* if (n1 < n2)
* return -1;
* else if (n1 > n2)
* return 1;
* else
* return 0;
* }
*
* bool insert_int(struct avl_tree_node **root_ptr, int data)
* {
* struct int_wrapper *i = malloc(sizeof(struct int_wrapper));
* i->data = data;
* if (avl_tree_insert(root_ptr, &i->index_node, _avl_cmp_ints)) {
* // Duplicate.
* free(i);
* return false;
* }
* return true;
* struct int_wrapper *i = malloc(sizeof(struct int_wrapper));
* i->data = data;
* if (avl_tree_insert(root_ptr, &i->index_node, _avl_cmp_ints)) {
* // Duplicate.
* free(i);
* return false;
* }
* return true;
* }
*/
static AVL_INLINE struct avl_tree_node *
avl_tree_insert(struct avl_tree_node **root_ptr,
struct avl_tree_node *item,
int (*cmp)(const struct avl_tree_node *,
const struct avl_tree_node *))
struct avl_tree_node *item,
int (*cmp)(const struct avl_tree_node *, const struct avl_tree_node *))
{
struct avl_tree_node **cur_ptr = root_ptr, *cur = NULL;
int res;
struct avl_tree_node **cur_ptr = root_ptr, *cur = NULL;
int res;

while (*cur_ptr) {
cur = *cur_ptr;
res = (*cmp)(item, cur);
if (res < 0)
cur_ptr = &cur->left;
else if (res > 0)
cur_ptr = &cur->right;
else
return cur;
}
*cur_ptr = item;
item->parent_balance = (uintptr_t)cur | 1;
avl_tree_rebalance_after_insert(root_ptr, item);
return NULL;
while (*cur_ptr) {
cur = *cur_ptr;
res = (*cmp)(item, cur);
if (res < 0) {
cur_ptr = &cur->left;
} else if (res > 0) {
cur_ptr = &cur->right;
} else {
return cur;
}
}
*cur_ptr = item;
item->parent_balance = (uintptr_t) cur | 1;
avl_tree_rebalance_after_insert(root_ptr, item);
return NULL;
}

/* Removes an item from the specified AVL tree.
* See implementation for details. */
extern void
avl_tree_remove(struct avl_tree_node **root_ptr, struct avl_tree_node *node);
extern void avl_tree_remove(struct avl_tree_node **root_ptr,
struct avl_tree_node *node);

/* Nonrecursive AVL tree traversal functions */

extern struct avl_tree_node *
avl_tree_first_in_order(const struct avl_tree_node *root);
extern struct avl_tree_node *avl_tree_first_in_order(const struct avl_tree_node *root);

extern struct avl_tree_node *
avl_tree_last_in_order(const struct avl_tree_node *root);
extern struct avl_tree_node *avl_tree_last_in_order(const struct avl_tree_node *root);

extern struct avl_tree_node *
avl_tree_next_in_order(const struct avl_tree_node *node);
extern struct avl_tree_node *avl_tree_next_in_order(const struct avl_tree_node *node);

extern struct avl_tree_node *
avl_tree_prev_in_order(const struct avl_tree_node *node);
extern struct avl_tree_node *avl_tree_prev_in_order(const struct avl_tree_node *node);

extern struct avl_tree_node *
avl_tree_first_in_postorder(const struct avl_tree_node *root);

extern struct avl_tree_node *
avl_tree_next_in_postorder(const struct avl_tree_node *prev,
const struct avl_tree_node *prev_parent);
const struct avl_tree_node *prev_parent);

/*
* Iterate through the nodes in an AVL tree in sorted order.
* You may not modify the tree during the iteration.
*
* @child_struct
* Variable that will receive a pointer to each struct inserted into the
* tree.
* Variable that will receive a pointer to each struct inserted into the
* tree.
* @root
* Root of the AVL tree.
* Root of the AVL tree.
* @struct_name
* Type of *child_struct.
* Type of *child_struct.
* @struct_member
* Member of @struct_name type that is the AVL tree node.
* Member of @struct_name type that is the AVL tree node.
*
* Example:
*
* struct int_wrapper {
* int data;
* struct avl_tree_node index_node;
* int data;
* struct avl_tree_node index_node;
* };
*
* void print_ints(struct avl_tree_node *root)
* {
* struct int_wrapper *i;
* struct int_wrapper *i;
*
* avl_tree_for_each_in_order(i, root, struct int_wrapper, index_node)
* printf("%d\n", i->data);
* avl_tree_for_each_in_order(i, root, struct int_wrapper, index_node)
* printf("%d\n", i->data);
* }
*/
#define avl_tree_for_each_in_order(child_struct, root, \
struct_name, struct_member) \
for (struct avl_tree_node *_cur = \
avl_tree_first_in_order(root); \
_cur && ((child_struct) = \
avl_tree_entry(_cur, struct_name, \
struct_member), 1); \
_cur = avl_tree_next_in_order(_cur))
#define avl_tree_for_each_in_order(child_struct, root, struct_name, struct_member) \
for (struct avl_tree_node *_cur = avl_tree_first_in_order(root); \
_cur && \
((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1); \
_cur = avl_tree_next_in_order(_cur))

/*
* Like avl_tree_for_each_in_order(), but uses the reverse order.
*/
#define avl_tree_for_each_in_reverse_order(child_struct, root, \
struct_name, struct_member) \
for (struct avl_tree_node *_cur = \
avl_tree_last_in_order(root); \
_cur && ((child_struct) = \
avl_tree_entry(_cur, struct_name, \
struct_member), 1); \
_cur = avl_tree_prev_in_order(_cur))
#define avl_tree_for_each_in_reverse_order( \
child_struct, root, struct_name, struct_member) \
for (struct avl_tree_node *_cur = avl_tree_last_in_order(root); \
_cur && \
((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1); \
_cur = avl_tree_prev_in_order(_cur))

/*
* Like avl_tree_for_each_in_order(), but iterates through the nodes in
* postorder, so the current node may be deleted or freed.
*/
#define avl_tree_for_each_in_postorder(child_struct, root, \
struct_name, struct_member) \
for (struct avl_tree_node *_cur = \
avl_tree_first_in_postorder(root), *_parent; \
_cur && ((child_struct) = \
avl_tree_entry(_cur, struct_name, \
struct_member), 1) \
&& (_parent = avl_get_parent(_cur), 1); \
_cur = avl_tree_next_in_postorder(_cur, _parent))
#define avl_tree_for_each_in_postorder(child_struct, root, struct_name, struct_member) \
for (struct avl_tree_node *_cur = avl_tree_first_in_postorder(root), *_parent; \
_cur && \
((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1) && \
(_parent = avl_get_parent(_cur), 1); \
_cur = avl_tree_next_in_postorder(_cur, _parent))

#endif /* _AVL_TREE_H_ */

+ 34
- 33
src/mwparserfromhell/parser/ctokenizer/common.h View File

@@ -23,55 +23,56 @@ SOFTWARE.
#pragma once

#ifndef PY_SSIZE_T_CLEAN
#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html
# define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html
#endif

#include <Python.h>
#include <structmember.h>
#include <bytesobject.h>
#include <structmember.h>

#include "avl_tree.h"

/* Compatibility macros */

#ifndef uint64_t
#define uint64_t unsigned PY_LONG_LONG
# define uint64_t unsigned PY_LONG_LONG
#endif

#define malloc PyObject_Malloc // XXX: yuck
#define malloc PyObject_Malloc // XXX: yuck
#define realloc PyObject_Realloc
#define free PyObject_Free

/* Unicode support macros */

#define PyUnicode_FROM_SINGLE(chr) \
#define PyUnicode_FROM_SINGLE(chr) \
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)

/* Error handling macros */

#define BAD_ROUTE self->route_state
#define BAD_ROUTE_CONTEXT self->route_context
#define FAIL_ROUTE(context) { \
self->route_state = 1; \
self->route_context = context; \
}
#define RESET_ROUTE() self->route_state = 0
#define BAD_ROUTE self->route_state
#define BAD_ROUTE_CONTEXT self->route_context
#define FAIL_ROUTE(context) \
do { \
self->route_state = 1; \
self->route_context = context; \
} while (0)
#define RESET_ROUTE() self->route_state = 0

/* Shared globals */

extern char** entitydefs;
extern char **entitydefs;

extern PyObject* NOARGS;
extern PyObject* definitions;
extern PyObject *NOARGS;
extern PyObject *definitions;

/* Structs */

typedef struct {
Py_ssize_t capacity;
Py_ssize_t length;
PyObject* object;
PyObject *object;
int kind;
void* data;
void *data;
} Textbuffer;

typedef struct {
@@ -80,19 +81,19 @@ typedef struct {
} StackIdent;

struct Stack {
PyObject* stack;
PyObject *stack;
uint64_t context;
Textbuffer* textbuffer;
Textbuffer *textbuffer;
StackIdent ident;
struct Stack* next;
struct Stack *next;
};
typedef struct Stack Stack;

typedef struct {
PyObject* object; /* base PyUnicodeObject object */
Py_ssize_t length; /* length of object, in code points */
int kind; /* object's kind value */
void* data; /* object's raw unicode buffer */
PyObject *object; /* base PyUnicodeObject object */
Py_ssize_t length; /* length of object, in code points */
int kind; /* object's kind value */
void *data; /* object's raw unicode buffer */
} TokenizerInput;

typedef struct avl_tree_node avl_tree;
@@ -104,13 +105,13 @@ typedef struct {

typedef struct {
PyObject_HEAD
TokenizerInput text; /* text to tokenize */
Stack* topstack; /* topmost stack */
Py_ssize_t head; /* current position in text */
int global; /* global context */
int depth; /* stack recursion depth */
int route_state; /* whether a BadRoute has been triggered */
uint64_t route_context; /* context when the last BadRoute was triggered */
avl_tree* bad_routes; /* stack idents for routes known to fail */
int skip_style_tags; /* temp fix for the sometimes broken tag parser */
TokenizerInput text; /* text to tokenize */
Stack *topstack; /* topmost stack */
Py_ssize_t head; /* current position in text */
int global; /* global context */
int depth; /* stack recursion depth */
int route_state; /* whether a BadRoute has been triggered */
uint64_t route_context; /* context when the last BadRoute was triggered */
avl_tree *bad_routes; /* stack idents for routes known to fail */
int skip_style_tags; /* temp fix for the sometimes broken tag parser */
} Tokenizer;

+ 11
- 5
src/mwparserfromhell/parser/ctokenizer/contexts.h View File

@@ -89,11 +89,17 @@ SOFTWARE.

/* Aggregate contexts */

#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)
#define AGG_FAIL \
(LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | \
LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE \
(LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | \
LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS \
(LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS \
(LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)

/* Tag contexts */



+ 31
- 23
src/mwparserfromhell/parser/ctokenizer/definitions.c View File

@@ -27,7 +27,8 @@ SOFTWARE.
See the Python version for data sources.
*/

static const char* URI_SCHEMES[] = {
// clang-format off
static const char *URI_SCHEMES[] = {
"bitcoin",
"ftp",
"ftps",
@@ -55,10 +56,10 @@ static const char* URI_SCHEMES[] = {
"urn",
"worldwind",
"xmpp",
NULL,
NULL,
};

static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
static const char *URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
"bitcoin",
"geo",
"magnet",
@@ -73,7 +74,7 @@ static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
NULL,
};

static const char* PARSER_BLACKLIST[] = {
static const char *PARSER_BLACKLIST[] = {
"categorytree",
"ce",
"chem",
@@ -93,32 +94,32 @@ static const char* PARSER_BLACKLIST[] = {
"timeline",
NULL,
};
// clang-format on

static const char* SINGLE[] = {
"br", "wbr", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td",
"tr", NULL
};
static const char *SINGLE[] = {
"br", "wbr", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr", NULL};

static const char* SINGLE_ONLY[] = {
"br", "wbr", "hr", "meta", "link", "img", NULL
};
static const char *SINGLE_ONLY[] = {"br", "wbr", "hr", "meta", "link", "img", NULL};

/*
Convert a PyUnicodeObject to a lowercase ASCII char* array and store it in
the second argument. The caller must free the return value when finished.
If the return value is NULL, the conversion failed and *string is not set.
*/
static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string)
static PyObject *
unicode_to_lcase_ascii(PyObject *input, const char **string)
{
PyObject *lower = PyObject_CallMethod(input, "lower", NULL), *bytes;

if (!lower)
if (!lower) {
return NULL;
}
bytes = PyUnicode_AsASCIIString(lower);
Py_DECREF(lower);
if (!bytes) {
if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) {
PyErr_Clear();
}
return NULL;
}
*string = PyBytes_AS_STRING(bytes);
@@ -128,14 +129,16 @@ static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string)
/*
Return whether a PyUnicodeObject is in a list of lowercase ASCII strings.
*/
static int unicode_in_string_list(PyObject *input, const char **list)
static int
unicode_in_string_list(PyObject *input, const char **list)
{
const char *string;
PyObject *temp = unicode_to_lcase_ascii(input, &string);
int retval = 0;

if (!temp)
if (!temp) {
return 0;
}

while (*list) {
if (!strcmp(*(list++), string)) {
@@ -144,7 +147,7 @@ static int unicode_in_string_list(PyObject *input, const char **list)
}
}

end:
end:
Py_DECREF(temp);
return retval;
}
@@ -152,7 +155,8 @@ static int unicode_in_string_list(PyObject *input, const char **list)
/*
Return if the given tag's contents should be passed to the parser.
*/
int is_parsable(PyObject *tag)
int
is_parsable(PyObject *tag)
{
return !unicode_in_string_list(tag, PARSER_BLACKLIST);
}
@@ -160,7 +164,8 @@ int is_parsable(PyObject *tag)
/*
Return whether or not the given tag can exist without a close tag.
*/
int is_single(PyObject *tag)
int
is_single(PyObject *tag)
{
return unicode_in_string_list(tag, SINGLE);
}
@@ -168,7 +173,8 @@ int is_single(PyObject *tag)
/*
Return whether or not the given tag must exist without a close tag.
*/
int is_single_only(PyObject *tag)
int
is_single_only(PyObject *tag)
{
return unicode_in_string_list(tag, SINGLE_ONLY);
}
@@ -176,10 +182,12 @@ int is_single_only(PyObject *tag)
/*
Return whether the given scheme is valid for external links.
*/
int is_scheme(PyObject *scheme, int slashes)
int
is_scheme(PyObject *scheme, int slashes)
{
if (slashes)
if (slashes) {
return unicode_in_string_list(scheme, URI_SCHEMES);
else
} else {
return unicode_in_string_list(scheme, URI_SCHEMES_AUTHORITY_OPTIONAL);
}
}

+ 5
- 6
src/mwparserfromhell/parser/ctokenizer/definitions.h View File

@@ -28,12 +28,11 @@ SOFTWARE.

/* Functions */

int is_parsable(PyObject*);
int is_single(PyObject*);
int is_single_only(PyObject*);
int is_scheme(PyObject*, int);
int is_parsable(PyObject *);
int is_single(PyObject *);
int is_single_only(PyObject *);
int is_scheme(PyObject *, int);

/* Macros */

#define GET_HTML_TAG(markup) \
(markup == ':' ? "dd" : markup == ';' ? "dt" : "li")
#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li")

+ 20
- 14
src/mwparserfromhell/parser/ctokenizer/tag_data.c View File

@@ -26,13 +26,14 @@ SOFTWARE.
/*
Initialize a new TagData object.
*/
TagData* TagData_new(TokenizerInput* text)
TagData *
TagData_new(TokenizerInput *text)
{
#define ALLOC_BUFFER(name) \
name = Textbuffer_new(text); \
if (!name) { \
TagData_dealloc(self); \
return NULL; \
#define ALLOC_BUFFER(name) \
name = Textbuffer_new(text); \
if (!name) { \
TagData_dealloc(self); \
return NULL; \
}

TagData *self = malloc(sizeof(TagData));
@@ -54,25 +55,30 @@ TagData* TagData_new(TokenizerInput* text)
/*
Deallocate the given TagData object.
*/
void TagData_dealloc(TagData* self)
void
TagData_dealloc(TagData *self)
{
if (self->pad_first)
if (self->pad_first) {
Textbuffer_dealloc(self->pad_first);
if (self->pad_before_eq)
}
if (self->pad_before_eq) {
Textbuffer_dealloc(self->pad_before_eq);
if (self->pad_after_eq)
}
if (self->pad_after_eq) {
Textbuffer_dealloc(self->pad_after_eq);
}
free(self);
}

/*
Clear the internal buffers of the given TagData object.
*/
int TagData_reset_buffers(TagData* self)
int
TagData_reset_buffers(TagData *self)
{
if (Textbuffer_reset(self->pad_first) ||
Textbuffer_reset(self->pad_before_eq) ||
Textbuffer_reset(self->pad_after_eq))
if (Textbuffer_reset(self->pad_first) || Textbuffer_reset(self->pad_before_eq) ||
Textbuffer_reset(self->pad_after_eq)) {
return -1;
}
return 0;
}

+ 6
- 6
src/mwparserfromhell/parser/ctokenizer/tag_data.h View File

@@ -29,15 +29,15 @@ SOFTWARE.

typedef struct {
uint64_t context;
Textbuffer* pad_first;
Textbuffer* pad_before_eq;
Textbuffer* pad_after_eq;
Textbuffer *pad_first;
Textbuffer *pad_before_eq;
Textbuffer *pad_after_eq;
Py_UCS4 quoter;
Py_ssize_t reset;
} TagData;

/* Functions */

TagData* TagData_new(TokenizerInput*);
void TagData_dealloc(TagData*);
int TagData_reset_buffers(TagData*);
TagData *TagData_new(TokenizerInput *);
void TagData_dealloc(TagData *);
int TagData_reset_buffers(TagData *);

+ 45
- 26
src/mwparserfromhell/parser/ctokenizer/textbuffer.c View File

@@ -23,20 +23,22 @@ SOFTWARE.
#include "textbuffer.h"

#define INITIAL_CAPACITY 32
#define RESIZE_FACTOR 2
#define CONCAT_EXTRA 32
#define RESIZE_FACTOR 2
#define CONCAT_EXTRA 32

/*
Internal allocation function for textbuffers.
*/
static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar)
static int
internal_alloc(Textbuffer *self, Py_UCS4 maxchar)
{
self->capacity = INITIAL_CAPACITY;
self->length = 0;

self->object = PyUnicode_New(self->capacity, maxchar);
if (!self->object)
if (!self->object) {
return -1;
}
self->kind = PyUnicode_KIND(self->object);
self->data = PyUnicode_DATA(self->object);

@@ -46,7 +48,8 @@ static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar)
/*
Internal deallocation function for textbuffers.
*/
static void internal_dealloc(Textbuffer* self)
static void
internal_dealloc(Textbuffer *self)
{
Py_DECREF(self->object);
}
@@ -54,14 +57,16 @@ static void internal_dealloc(Textbuffer* self)
/*
Internal resize function.
*/
static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
static int
internal_resize(Textbuffer *self, Py_ssize_t new_cap)
{
PyObject *newobj;
void *newdata;

newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object));
if (!newobj)
if (!newobj) {
return -1;
}
newdata = PyUnicode_DATA(newobj);
memcpy(newdata, self->data, self->length * self->kind);
Py_DECREF(self->object);
@@ -75,22 +80,25 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
/*
Create a new textbuffer object.
*/
Textbuffer* Textbuffer_new(TokenizerInput* text)
Textbuffer *
Textbuffer_new(TokenizerInput *text)
{
Textbuffer* self = malloc(sizeof(Textbuffer));
Textbuffer *self = malloc(sizeof(Textbuffer));
Py_UCS4 maxchar = 0;

maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);

if (!self)
if (!self) {
goto fail_nomem;
if (internal_alloc(self, maxchar) < 0)
}
if (internal_alloc(self, maxchar) < 0) {
goto fail_dealloc;
}
return self;

fail_dealloc:
fail_dealloc:
free(self);
fail_nomem:
fail_nomem:
PyErr_NoMemory();
return NULL;
}
@@ -98,7 +106,8 @@ Textbuffer* Textbuffer_new(TokenizerInput* text)
/*
Deallocate the given textbuffer.
*/
void Textbuffer_dealloc(Textbuffer* self)
void
Textbuffer_dealloc(Textbuffer *self)
{
internal_dealloc(self);
free(self);
@@ -107,26 +116,30 @@ void Textbuffer_dealloc(Textbuffer* self)
/*
Reset a textbuffer to its initial, empty state.
*/
int Textbuffer_reset(Textbuffer* self)
int
Textbuffer_reset(Textbuffer *self)
{
Py_UCS4 maxchar = 0;

maxchar = PyUnicode_MAX_CHAR_VALUE(self->object);

internal_dealloc(self);
if (internal_alloc(self, maxchar))
if (internal_alloc(self, maxchar)) {
return -1;
}
return 0;
}

/*
Write a Unicode codepoint to the given textbuffer.