@@ -0,0 +1,13 @@ | |||
BasedOnStyle: LLVM | |||
AlignConsecutiveMacros: AcrossEmptyLines | |||
AllowShortFunctionsOnASingleLine: Inline | |||
AlwaysBreakAfterReturnType: TopLevelDefinitions | |||
BinPackArguments: false | |||
BinPackParameters: false | |||
BreakBeforeBraces: Linux | |||
ColumnLimit: 88 | |||
IndentPPDirectives: AfterHash | |||
IndentWidth: 4 | |||
SpaceAfterCStyleCast: true | |||
StatementMacros: | |||
- PyObject_HEAD |
@@ -13,5 +13,6 @@ dist | |||
docs/_build | |||
scripts/*.log | |||
htmlcov/ | |||
compile_commands.json | |||
.idea/ | |||
.pytest_cache/ |
@@ -41,6 +41,7 @@ from mwparserfromhell.parser._tokenizer import CTokenizer | |||
LOOPS = 10000 | |||
class Color: | |||
GRAY = "\x1b[30;1m" | |||
GREEN = "\x1b[92m" | |||
@@ -63,11 +64,11 @@ class MemoryTest: | |||
data = {"name": None, "label": None, "input": None, "output": None} | |||
for line in test.strip().splitlines(): | |||
if line.startswith("name:"): | |||
data["name"] = line[len("name:"):].strip() | |||
data["name"] = line[len("name:") :].strip() | |||
elif line.startswith("label:"): | |||
data["label"] = line[len("label:"):].strip() | |||
data["label"] = line[len("label:") :].strip() | |||
elif line.startswith("input:"): | |||
raw = line[len("input:"):].strip() | |||
raw = line[len("input:") :].strip() | |||
if raw[0] == '"' and raw[-1] == '"': | |||
raw = raw[1:-1] | |||
raw = raw.encode("raw_unicode_escape") | |||
@@ -81,7 +82,7 @@ class MemoryTest: | |||
def load_file(filename): | |||
with open(filename, "rU") as fp: | |||
text = fp.read() | |||
name = path.split(filename)[1][:0-len(extension)] | |||
name = path.split(filename)[1][: 0 - len(extension)] | |||
self._parse_file(name, text) | |||
root = path.split(path.dirname(path.abspath(__file__)))[0] | |||
@@ -119,8 +120,11 @@ class MemoryTest: | |||
tmpl = "{0}[{1:03}/{2}]{3} {4}: " | |||
for i, (name, text) in enumerate(self._tests, 1): | |||
sys.stdout.write(tmpl.format(Color.GRAY, i, len(self._tests), | |||
Color.RESET, name.ljust(width))) | |||
sys.stdout.write( | |||
tmpl.format( | |||
Color.GRAY, i, len(self._tests), Color.RESET, name.ljust(width) | |||
) | |||
) | |||
sys.stdout.flush() | |||
parent, child = Pipe() | |||
p = Process(target=_runner, args=(text, child)) | |||
@@ -156,6 +160,7 @@ def _runner(text, child): | |||
child.send("OK") | |||
child.recv() | |||
if __name__ == "__main__": | |||
setlocale(LC_ALL, "") | |||
MemoryTest().run() |
@@ -52,8 +52,10 @@ elif env_var is not None: | |||
# Remove the command line argument as it isn't understood by setuptools: | |||
sys.argv = [arg for arg in sys.argv | |||
if arg not in ("--without-extension", "--with-extension")] | |||
sys.argv = [ | |||
arg for arg in sys.argv if arg not in ("--without-extension", "--with-extension") | |||
] | |||
def build_ext_patched(self): | |||
try: | |||
@@ -63,33 +65,40 @@ def build_ext_patched(self): | |||
print("Falling back to pure Python mode.") | |||
del self.extensions[:] | |||
if fallback: | |||
build_ext.run, build_ext_original = build_ext_patched, build_ext.run | |||
# Project-specific part begins here: | |||
tokenizer = Extension("mwparserfromhell.parser._tokenizer", | |||
sources=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.c")), | |||
depends=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.h"))) | |||
tokenizer = Extension( | |||
"mwparserfromhell.parser._tokenizer", | |||
sources=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.c")), | |||
depends=sorted(glob("src/mwparserfromhell/parser/ctokenizer/*.h")), | |||
) | |||
setup( | |||
name = "mwparserfromhell", | |||
packages = find_packages("src"), | |||
package_dir = {"": "src"}, | |||
ext_modules = [tokenizer] if use_extension else [], | |||
setup_requires = ["pytest-runner"] if "test" in sys.argv or "pytest" in sys.argv else [], | |||
tests_require = ["pytest"], | |||
version = __version__, | |||
python_requires = ">= 3.5", | |||
author = "Ben Kurtovic", | |||
author_email = "ben.kurtovic@gmail.com", | |||
url = "https://github.com/earwig/mwparserfromhell", | |||
description = "MWParserFromHell is a parser for MediaWiki wikicode.", | |||
long_description = long_docs, | |||
download_url = "https://github.com/earwig/mwparserfromhell/tarball/v{}".format(__version__), | |||
keywords = "earwig mwparserfromhell wikipedia wiki mediawiki wikicode template parsing", | |||
license = "MIT License", | |||
classifiers = [ | |||
name="mwparserfromhell", | |||
packages=find_packages("src"), | |||
package_dir={"": "src"}, | |||
ext_modules=[tokenizer] if use_extension else [], | |||
setup_requires=["pytest-runner"] | |||
if "test" in sys.argv or "pytest" in sys.argv | |||
else [], | |||
tests_require=["pytest"], | |||
version=__version__, | |||
python_requires=">= 3.5", | |||
author="Ben Kurtovic", | |||
author_email="ben.kurtovic@gmail.com", | |||
url="https://github.com/earwig/mwparserfromhell", | |||
description="MWParserFromHell is a parser for MediaWiki wikicode.", | |||
long_description=long_docs, | |||
download_url="https://github.com/earwig/mwparserfromhell/tarball/v{}".format( | |||
__version__ | |||
), | |||
keywords="earwig mwparserfromhell wikipedia wiki mediawiki wikicode template parsing", | |||
license="MIT License", | |||
classifiers=[ | |||
"Development Status :: 4 - Beta", | |||
"Environment :: Console", | |||
"Intended Audience :: Developers", | |||
@@ -101,6 +110,6 @@ setup( | |||
"Programming Language :: Python :: 3.7", | |||
"Programming Language :: Python :: 3.8", | |||
"Programming Language :: Python :: 3.9", | |||
"Topic :: Text Processing :: Markup" | |||
"Topic :: Text Processing :: Markup", | |||
], | |||
) |
@@ -30,7 +30,6 @@ __license__ = "MIT License" | |||
__version__ = "0.7.dev0" | |||
__email__ = "ben.kurtovic@gmail.com" | |||
from . import (definitions, nodes, parser, smart_list, string_mixin, | |||
utils, wikicode) | |||
from . import definitions, nodes, parser, smart_list, string_mixin, utils, wikicode | |||
parse = utils.parse_anything |
@@ -26,8 +26,14 @@ When updating this file, please also update the the C tokenizer version: | |||
- mwparserfromhell/parser/ctokenizer/definitions.h | |||
""" | |||
__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", | |||
"is_single_only", "is_scheme"] | |||
__all__ = [ | |||
"get_html_tag", | |||
"is_parsable", | |||
"is_visible", | |||
"is_single", | |||
"is_single_only", | |||
"is_scheme", | |||
] | |||
URI_SCHEMES = { | |||
# [wikimedia/mediawiki.git]/includes/DefaultSettings.php @ 5c660de5d0 | |||
@@ -92,7 +98,7 @@ INVISIBLE_TAGS = [ | |||
"score", | |||
"section", | |||
"templatedata", | |||
"timeline" | |||
"timeline", | |||
] | |||
# [wikimedia/mediawiki.git]/includes/parser/Sanitizer.php @ 95e17ee645 | |||
@@ -103,29 +109,35 @@ MARKUP_TO_HTML = { | |||
"#": "li", | |||
"*": "li", | |||
";": "dt", | |||
":": "dd" | |||
":": "dd", | |||
} | |||
def get_html_tag(markup): | |||
"""Return the HTML tag associated with the given wiki-markup.""" | |||
return MARKUP_TO_HTML[markup] | |||
def is_parsable(tag): | |||
"""Return if the given *tag*'s contents should be passed to the parser.""" | |||
return tag.lower() not in PARSER_BLACKLIST | |||
def is_visible(tag): | |||
"""Return whether or not the given *tag* contains visible text.""" | |||
return tag.lower() not in INVISIBLE_TAGS | |||
def is_single(tag): | |||
"""Return whether or not the given *tag* can exist without a close tag.""" | |||
return tag.lower() in SINGLE | |||
def is_single_only(tag): | |||
"""Return whether or not the given *tag* must exist without a close tag.""" | |||
return tag.lower() in SINGLE_ONLY | |||
def is_scheme(scheme, slashes=True): | |||
"""Return whether *scheme* is valid for external links.""" | |||
scheme = scheme.lower() | |||
@@ -39,5 +39,15 @@ from .tag import Tag | |||
from .template import Template | |||
from .wikilink import Wikilink | |||
__all__ = ["Argument", "Comment", "ExternalLink", "HTMLEntity", "Heading", | |||
"Node", "Tag", "Template", "Text", "Wikilink"] | |||
__all__ = [ | |||
"Argument", | |||
"Comment", | |||
"ExternalLink", | |||
"HTMLEntity", | |||
"Heading", | |||
"Node", | |||
"Tag", | |||
"Template", | |||
"Text", | |||
"Wikilink", | |||
] |
@@ -22,6 +22,7 @@ from ..string_mixin import StringMixIn | |||
__all__ = ["Node"] | |||
class Node(StringMixIn): | |||
"""Represents the base Node type, demonstrating the methods to override. | |||
@@ -35,6 +36,7 @@ class Node(StringMixIn): | |||
:meth:`__showtree__` can be overridden to build a nice tree representation | |||
of the node, if desired, for :meth:`~.Wikicode.get_tree`. | |||
""" | |||
def __str__(self): | |||
raise NotImplementedError() | |||
@@ -24,6 +24,7 @@ from ..utils import parse_anything | |||
__all__ = ["Argument"] | |||
class Argument(Node): | |||
"""Represents a template argument substitution, like ``{{{foo}}}``.""" | |||
@@ -23,6 +23,7 @@ from ._base import Node | |||
__all__ = ["Comment"] | |||
class Comment(Node): | |||
"""Represents a hidden HTML comment, like ``<!-- foobar -->``.""" | |||
@@ -24,6 +24,7 @@ from ..utils import parse_anything | |||
__all__ = ["ExternalLink"] | |||
class ExternalLink(Node): | |||
"""Represents an external link, like ``[http://example.com/ Example]``.""" | |||
@@ -83,6 +84,7 @@ class ExternalLink(Node): | |||
def url(self, value): | |||
# pylint: disable=import-outside-toplevel | |||
from ..parser import contexts | |||
self._url = parse_anything(value, contexts.EXT_LINK_URI) | |||
@title.setter | |||
@@ -24,6 +24,7 @@ from ...utils import parse_anything | |||
__all__ = ["Attribute"] | |||
class Attribute(StringMixIn): | |||
"""Represents an attribute of an HTML tag. | |||
@@ -32,8 +33,15 @@ class Attribute(StringMixIn): | |||
whose value is ``"foo"``. | |||
""" | |||
def __init__(self, name, value=None, quotes='"', pad_first=" ", | |||
pad_before_eq="", pad_after_eq=""): | |||
def __init__( | |||
self, | |||
name, | |||
value=None, | |||
quotes='"', | |||
pad_first=" ", | |||
pad_before_eq="", | |||
pad_after_eq="", | |||
): | |||
super().__init__() | |||
self.name = name | |||
self._quotes = None | |||
@@ -25,6 +25,7 @@ from ...utils import parse_anything | |||
__all__ = ["Parameter"] | |||
class Parameter(StringMixIn): | |||
"""Represents a paramater of a template. | |||
@@ -77,6 +78,5 @@ class Parameter(StringMixIn): | |||
def showkey(self, newval): | |||
newval = bool(newval) | |||
if not newval and not self.can_hide_key(self.name): | |||
raise ValueError("parameter key {!r} cannot be hidden".format( | |||
self.name)) | |||
raise ValueError("parameter key {!r} cannot be hidden".format(self.name)) | |||
self._showkey = newval |
@@ -24,6 +24,7 @@ from ..utils import parse_anything | |||
__all__ = ["Heading"] | |||
class Heading(Node): | |||
"""Represents a section heading in wikicode, like ``== Foo ==``.""" | |||
@@ -24,6 +24,7 @@ from ._base import Node | |||
__all__ = ["HTMLEntity"] | |||
class HTMLEntity(Node): | |||
"""Represents an HTML entity, like `` ``, either named or unnamed.""" | |||
@@ -101,19 +102,23 @@ class HTMLEntity(Node): | |||
except ValueError: | |||
if newval not in htmlentities.entitydefs: | |||
raise ValueError( | |||
"entity value {!r} is not a valid name".format(newval)) from None | |||
"entity value {!r} is not a valid name".format(newval) | |||
) from None | |||
self._named = True | |||
self._hexadecimal = False | |||
else: | |||
if intval < 0 or intval > 0x10FFFF: | |||
raise ValueError( | |||
"entity value 0x{:x} is not in range(0x110000)".format(intval)) from None | |||
"entity value 0x{:x} is not in range(0x110000)".format(intval) | |||
) from None | |||
self._named = False | |||
self._hexadecimal = True | |||
else: | |||
test = int(newval, 16 if self.hexadecimal else 10) | |||
if test < 0 or test > 0x10FFFF: | |||
raise ValueError("entity value {} is not in range(0x110000)".format(test)) | |||
raise ValueError( | |||
"entity value {} is not in range(0x110000)".format(test) | |||
) | |||
self._named = False | |||
self._value = newval | |||
@@ -126,8 +131,10 @@ class HTMLEntity(Node): | |||
try: | |||
int(self.value, 16) | |||
except ValueError as exc: | |||
raise ValueError("current entity value {!r} is not a valid " | |||
"Unicode codepoint".format(self.value)) from exc | |||
raise ValueError( | |||
"current entity value {!r} is not a valid " | |||
"Unicode codepoint".format(self.value) | |||
) from exc | |||
self._named = newval | |||
@hexadecimal.setter | |||
@@ -26,13 +26,24 @@ from ..utils import parse_anything | |||
__all__ = ["Tag"] | |||
class Tag(Node): | |||
"""Represents an HTML-style tag in wikicode, like ``<ref>``.""" | |||
def __init__(self, tag, contents=None, attrs=None, wiki_markup=None, | |||
self_closing=False, invalid=False, implicit=False, padding="", | |||
closing_tag=None, wiki_style_separator=None, | |||
closing_wiki_markup=None): | |||
def __init__( | |||
self, | |||
tag, | |||
contents=None, | |||
attrs=None, | |||
wiki_markup=None, | |||
self_closing=False, | |||
invalid=False, | |||
implicit=False, | |||
padding="", | |||
closing_tag=None, | |||
wiki_style_separator=None, | |||
closing_wiki_markup=None, | |||
): | |||
super().__init__() | |||
self.tag = tag | |||
self.contents = contents | |||
@@ -60,8 +71,14 @@ class Tag(Node): | |||
if self.self_closing: | |||
return self.wiki_markup + attrs + padding + separator | |||
close = self.closing_wiki_markup or "" | |||
return self.wiki_markup + attrs + padding + separator + \ | |||
str(self.contents) + close | |||
return ( | |||
self.wiki_markup | |||
+ attrs | |||
+ padding | |||
+ separator | |||
+ str(self.contents) | |||
+ close | |||
) | |||
result = ("</" if self.invalid else "<") + str(self.tag) | |||
if self.attributes: | |||
@@ -270,8 +287,15 @@ class Tag(Node): | |||
return attr | |||
raise ValueError(name) | |||
def add(self, name, value=None, quotes='"', pad_first=" ", | |||
pad_before_eq="", pad_after_eq=""): | |||
def add( | |||
self, | |||
name, | |||
value=None, | |||
quotes='"', | |||
pad_first=" ", | |||
pad_before_eq="", | |||
pad_after_eq="", | |||
): | |||
"""Add an attribute with the given *name* and *value*. | |||
*name* and *value* can be anything parsable by | |||
@@ -33,6 +33,7 @@ FLAGS = re.DOTALL | re.UNICODE | |||
# Used to allow None as a valid fallback value | |||
_UNSET = object() | |||
class Template(Node): | |||
"""Represents a template in wikicode, like ``{{foo}}``.""" | |||
@@ -153,7 +154,7 @@ class Template(Node): | |||
def _fix_dependendent_params(self, i): | |||
"""Unhide keys if necessary after removing the param at index *i*.""" | |||
if not self.params[i].showkey: | |||
for param in self.params[i + 1:]: | |||
for param in self.params[i + 1 :]: | |||
if not param.showkey: | |||
param.showkey = True | |||
@@ -175,9 +176,10 @@ class Template(Node): | |||
If one exists, we should remove the given one rather than blanking it. | |||
""" | |||
if self.params[i].showkey: | |||
following = self.params[i + 1:] | |||
better_matches = [after.name.strip() == name and not after.showkey | |||
for after in following] | |||
following = self.params[i + 1 :] | |||
better_matches = [ | |||
after.name.strip() == name and not after.showkey for after in following | |||
] | |||
return any(better_matches) | |||
return False | |||
@@ -235,8 +237,7 @@ class Template(Node): | |||
def __getitem__(self, name): | |||
return self.get(name) | |||
def add(self, name, value, showkey=None, before=None, | |||
preserve_spacing=True): | |||
def add(self, name, value, showkey=None, before=None, preserve_spacing=True): | |||
"""Add a parameter to the template with a given *name* and *value*. | |||
*name* and *value* can be anything parsable by | |||
@@ -23,6 +23,7 @@ from ._base import Node | |||
__all__ = ["Text"] | |||
class Text(Node): | |||
"""Represents ordinary, unformatted text with no special properties.""" | |||
@@ -24,6 +24,7 @@ from ..utils import parse_anything | |||
__all__ = ["Wikilink"] | |||
class Wikilink(Node): | |||
"""Represents an internal wikilink, like ``[[Foo|Bar]]``.""" | |||
@@ -26,16 +26,20 @@ together into one interface. | |||
from .builder import Builder | |||
from .errors import ParserError | |||
try: | |||
from ._tokenizer import CTokenizer | |||
use_c = True | |||
except ImportError: | |||
from .tokenizer import Tokenizer | |||
CTokenizer = None | |||
use_c = False | |||
__all__ = ["use_c", "Parser", "ParserError"] | |||
class Parser: | |||
"""Represents a parser for wikicode. | |||
@@ -57,6 +61,7 @@ class Parser: | |||
self._tokenizer = CTokenizer() | |||
else: | |||
from .tokenizer import Tokenizer | |||
self._tokenizer = Tokenizer() | |||
self._builder = Builder() | |||
@@ -21,24 +21,34 @@ | |||
from . import tokens | |||
from .errors import ParserError | |||
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, | |||
Template, Text, Wikilink) | |||
from ..nodes import ( | |||
Argument, | |||
Comment, | |||
ExternalLink, | |||
Heading, | |||
HTMLEntity, | |||
Tag, | |||
Template, | |||
Text, | |||
Wikilink, | |||
) | |||
from ..nodes.extras import Attribute, Parameter | |||
from ..smart_list import SmartList | |||
from ..wikicode import Wikicode | |||
__all__ = ["Builder"] | |||
_HANDLERS = { | |||
tokens.Text: lambda self, token: Text(token.text) | |||
} | |||
_HANDLERS = {tokens.Text: lambda self, token: Text(token.text)} | |||
def _add_handler(token_type): | |||
"""Create a decorator that adds a handler function to the lookup table.""" | |||
def decorator(func): | |||
"""Add a handler function to the lookup table.""" | |||
_HANDLERS[token_type] = func | |||
return func | |||
return decorator | |||
@@ -84,8 +94,9 @@ class Builder: | |||
key = self._pop() | |||
showkey = True | |||
self._push() | |||
elif isinstance(token, (tokens.TemplateParamSeparator, | |||
tokens.TemplateClose)): | |||
elif isinstance( | |||
token, (tokens.TemplateParamSeparator, tokens.TemplateClose) | |||
): | |||
self._tokens.append(token) | |||
value = self._pop() | |||
if key is None: | |||
@@ -167,10 +178,17 @@ class Builder: | |||
self._push() | |||
elif isinstance(token, tokens.ExternalLinkClose): | |||
if url is not None: | |||
return ExternalLink(url, self._pop(), brackets=brackets, | |||
suppress_space=suppress_space is True) | |||
return ExternalLink(self._pop(), brackets=brackets, | |||
suppress_space=suppress_space is True) | |||
return ExternalLink( | |||
url, | |||
self._pop(), | |||
brackets=brackets, | |||
suppress_space=suppress_space is True, | |||
) | |||
return ExternalLink( | |||
self._pop(), | |||
brackets=brackets, | |||
suppress_space=suppress_space is True, | |||
) | |||
else: | |||
self._write(self._handle_token(token)) | |||
raise ParserError("_handle_external_link() missed a close token") | |||
@@ -184,8 +202,9 @@ class Builder: | |||
if isinstance(token, tokens.HTMLEntityHex): | |||
text = self._tokens.pop() | |||
self._tokens.pop() # Remove HTMLEntityEnd | |||
return HTMLEntity(text.text, named=False, hexadecimal=True, | |||
hex_char=token.char) | |||
return HTMLEntity( | |||
text.text, named=False, hexadecimal=True, hex_char=token.char | |||
) | |||
self._tokens.pop() # Remove HTMLEntityEnd | |||
return HTMLEntity(token.text, named=False, hexadecimal=False) | |||
self._tokens.pop() # Remove HTMLEntityEnd | |||
@@ -227,15 +246,23 @@ class Builder: | |||
self._push() | |||
elif isinstance(token, tokens.TagAttrQuote): | |||
quotes = token.char | |||
elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen, | |||
tokens.TagCloseSelfclose)): | |||
elif isinstance( | |||
token, | |||
(tokens.TagAttrStart, tokens.TagCloseOpen, tokens.TagCloseSelfclose), | |||
): | |||
self._tokens.append(token) | |||
if name: | |||
value = self._pop() | |||
else: | |||
name, value = self._pop(), None | |||
return Attribute(name, value, quotes, start.pad_first, | |||
start.pad_before_eq, start.pad_after_eq) | |||
return Attribute( | |||
name, | |||
value, | |||
quotes, | |||
start.pad_first, | |||
start.pad_before_eq, | |||
start.pad_after_eq, | |||
) | |||
else: | |||
self._write(self._handle_token(token)) | |||
raise ParserError("_handle_attribute() missed a close token") | |||
@@ -271,9 +298,19 @@ class Builder: | |||
else: | |||
self_closing = False | |||
closing_tag = self._pop() | |||
return Tag(tag, contents, attrs, wiki_markup, self_closing, | |||
invalid, implicit, padding, closing_tag, | |||
wiki_style_separator, closing_wiki_markup) | |||
return Tag( | |||
tag, | |||
contents, | |||
attrs, | |||
wiki_markup, | |||
self_closing, | |||
invalid, | |||
implicit, | |||
padding, | |||
closing_tag, | |||
wiki_style_separator, | |||
closing_wiki_markup, | |||
) | |||
else: | |||
self._write(self._handle_token(token)) | |||
raise ParserError("_handle_tag() missed a close token") | |||
@@ -116,21 +116,21 @@ Aggregate contexts: | |||
# Local contexts: | |||
TEMPLATE_NAME = 1 << 0 | |||
TEMPLATE_PARAM_KEY = 1 << 1 | |||
TEMPLATE_NAME = 1 << 0 | |||
TEMPLATE_PARAM_KEY = 1 << 1 | |||
TEMPLATE_PARAM_VALUE = 1 << 2 | |||
TEMPLATE = TEMPLATE_NAME + TEMPLATE_PARAM_KEY + TEMPLATE_PARAM_VALUE | |||
ARGUMENT_NAME = 1 << 3 | |||
ARGUMENT_NAME = 1 << 3 | |||
ARGUMENT_DEFAULT = 1 << 4 | |||
ARGUMENT = ARGUMENT_NAME + ARGUMENT_DEFAULT | |||
WIKILINK_TITLE = 1 << 5 | |||
WIKILINK_TEXT = 1 << 6 | |||
WIKILINK_TEXT = 1 << 6 | |||
WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT | |||
EXT_LINK_URI = 1 << 7 | |||
EXT_LINK_TITLE = 1 << 8 | |||
EXT_LINK_URI = 1 << 7 | |||
EXT_LINK_TITLE = 1 << 8 | |||
EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE | |||
HEADING_LEVEL_1 = 1 << 9 | |||
@@ -139,42 +139,61 @@ HEADING_LEVEL_3 = 1 << 11 | |||
HEADING_LEVEL_4 = 1 << 12 | |||
HEADING_LEVEL_5 = 1 << 13 | |||
HEADING_LEVEL_6 = 1 << 14 | |||
HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 + | |||
HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6) | |||
TAG_OPEN = 1 << 15 | |||
TAG_ATTR = 1 << 16 | |||
TAG_BODY = 1 << 17 | |||
HEADING = ( | |||
HEADING_LEVEL_1 | |||
+ HEADING_LEVEL_2 | |||
+ HEADING_LEVEL_3 | |||
+ HEADING_LEVEL_4 | |||
+ HEADING_LEVEL_5 | |||
+ HEADING_LEVEL_6 | |||
) | |||
TAG_OPEN = 1 << 15 | |||
TAG_ATTR = 1 << 16 | |||
TAG_BODY = 1 << 17 | |||
TAG_CLOSE = 1 << 18 | |||
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE | |||
STYLE_ITALICS = 1 << 19 | |||
STYLE_BOLD = 1 << 20 | |||
STYLE_PASS_AGAIN = 1 << 21 | |||
STYLE_SECOND_PASS = 1 << 22 | |||
STYLE_ITALICS = 1 << 19 | |||
STYLE_BOLD = 1 << 20 | |||
STYLE_PASS_AGAIN = 1 << 21 | |||
STYLE_SECOND_PASS = 1 << 22 | |||
STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS | |||
DL_TERM = 1 << 23 | |||
HAS_TEXT = 1 << 24 | |||
FAIL_ON_TEXT = 1 << 25 | |||
FAIL_NEXT = 1 << 26 | |||
HAS_TEXT = 1 << 24 | |||
FAIL_ON_TEXT = 1 << 25 | |||
FAIL_NEXT = 1 << 26 | |||
FAIL_ON_LBRACE = 1 << 27 | |||
FAIL_ON_RBRACE = 1 << 28 | |||
FAIL_ON_EQUALS = 1 << 29 | |||
HAS_TEMPLATE = 1 << 30 | |||
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | |||
FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE) | |||
TABLE_OPEN = 1 << 31 | |||
TABLE_CELL_OPEN = 1 << 32 | |||
HAS_TEMPLATE = 1 << 30 | |||
SAFETY_CHECK = ( | |||
HAS_TEXT | |||
+ FAIL_ON_TEXT | |||
+ FAIL_NEXT | |||
+ FAIL_ON_LBRACE | |||
+ FAIL_ON_RBRACE | |||
+ FAIL_ON_EQUALS | |||
+ HAS_TEMPLATE | |||
) | |||
TABLE_OPEN = 1 << 31 | |||
TABLE_CELL_OPEN = 1 << 32 | |||
TABLE_CELL_STYLE = 1 << 33 | |||
TABLE_ROW_OPEN = 1 << 34 | |||
TABLE_TD_LINE = 1 << 35 | |||
TABLE_TH_LINE = 1 << 36 | |||
TABLE_ROW_OPEN = 1 << 34 | |||
TABLE_TD_LINE = 1 << 35 | |||
TABLE_TH_LINE = 1 << 36 | |||
TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE | |||
TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + | |||
TABLE_TD_LINE + TABLE_TH_LINE) | |||
TABLE = ( | |||
TABLE_OPEN | |||
+ TABLE_CELL_OPEN | |||
+ TABLE_CELL_STYLE | |||
+ TABLE_ROW_OPEN | |||
+ TABLE_TD_LINE | |||
+ TABLE_TH_LINE | |||
) | |||
HTML_ENTITY = 1 << 37 | |||
@@ -184,14 +203,20 @@ GL_HEADING = 1 << 0 | |||
# Aggregate contexts: | |||
FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + | |||
STYLE + TABLE) | |||
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + | |||
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) | |||
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE | |||
UNSAFE = ( | |||
TEMPLATE_NAME | |||
+ WIKILINK_TITLE | |||
+ EXT_LINK_TITLE | |||
+ TEMPLATE_PARAM_KEY | |||
+ ARGUMENT_NAME | |||
+ TAG_CLOSE | |||
) | |||
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN | |||
NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI | |||
NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK | |||
def describe(context): | |||
"""Return a string describing the given context value, for debugging.""" | |||
flags = [] | |||
@@ -1,6 +1,6 @@ | |||
/* | |||
* avl_tree.h - intrusive, nonrecursive AVL tree data structure (self-balancing | |||
* binary search tree), header file | |||
* binary search tree), header file | |||
* | |||
* Written in 2014-2016 by Eric Biggers <ebiggers3@gmail.com> | |||
* Slight changes for compatibility by Ben Kurtovic <ben.kurtovic@gmail.com> | |||
@@ -24,60 +24,60 @@ | |||
#include <stddef.h> | |||
#if !defined(_MSC_VER) || (_MSC_VER >= 1600) | |||
#include <stdint.h> | |||
# include <stdint.h> | |||
#endif | |||
#ifdef __GNUC__ | |||
# define AVL_INLINE inline __attribute__((always_inline)) | |||
# define AVL_INLINE inline __attribute__((always_inline)) | |||
#elif defined(_MSC_VER) && (_MSC_VER < 1900) | |||
# define AVL_INLINE __inline | |||
# define AVL_INLINE __inline | |||
#else | |||
# define AVL_INLINE inline | |||
# define AVL_INLINE inline | |||
#endif | |||
/* Node in an AVL tree. Embed this in some other data structure. */ | |||
struct avl_tree_node { | |||
/* Pointer to left child or NULL */ | |||
struct avl_tree_node *left; | |||
/* Pointer to left child or NULL */ | |||
struct avl_tree_node *left; | |||
/* Pointer to right child or NULL */ | |||
struct avl_tree_node *right; | |||
/* Pointer to right child or NULL */ | |||
struct avl_tree_node *right; | |||
/* Pointer to parent combined with the balance factor. This saves 4 or | |||
* 8 bytes of memory depending on the CPU architecture. | |||
* | |||
* Low 2 bits: One greater than the balance factor of this subtree, | |||
* which is equal to height(right) - height(left). The mapping is: | |||
* | |||
* 00 => -1 | |||
* 01 => 0 | |||
* 10 => +1 | |||
* 11 => undefined | |||
* | |||
* The rest of the bits are the pointer to the parent node. It must be | |||
* 4-byte aligned, and it will be NULL if this is the root node and | |||
* therefore has no parent. */ | |||
uintptr_t parent_balance; | |||
/* Pointer to parent combined with the balance factor. This saves 4 or | |||
* 8 bytes of memory depending on the CPU architecture. | |||
* | |||
* Low 2 bits: One greater than the balance factor of this subtree, | |||
* which is equal to height(right) - height(left). The mapping is: | |||
* | |||
* 00 => -1 | |||
* 01 => 0 | |||
* 10 => +1 | |||
* 11 => undefined | |||
* | |||
* The rest of the bits are the pointer to the parent node. It must be | |||
* 4-byte aligned, and it will be NULL if this is the root node and | |||
* therefore has no parent. */ | |||
uintptr_t parent_balance; | |||
}; | |||
/* Cast an AVL tree node to the containing data structure. */ | |||
#define avl_tree_entry(entry, type, member) \ | |||
((type*) ((char *)(entry) - offsetof(type, member))) | |||
#define avl_tree_entry(entry, type, member) \ | |||
((type *) ((char *) (entry) -offsetof(type, member))) | |||
/* Returns a pointer to the parent of the specified AVL tree node, or NULL if it | |||
* is already the root of the tree. */ | |||
static AVL_INLINE struct avl_tree_node * | |||
avl_get_parent(const struct avl_tree_node *node) | |||
{ | |||
return (struct avl_tree_node *)(node->parent_balance & ~3); | |||
return (struct avl_tree_node *) (node->parent_balance & ~3); | |||
} | |||
/* Marks the specified AVL tree node as unlinked from any tree. */ | |||
static AVL_INLINE void | |||
avl_tree_node_set_unlinked(struct avl_tree_node *node) | |||
{ | |||
node->parent_balance = (uintptr_t)node; | |||
node->parent_balance = (uintptr_t) node; | |||
} | |||
/* Returns true iff the specified AVL tree node has been marked with | |||
@@ -86,30 +86,29 @@ avl_tree_node_set_unlinked(struct avl_tree_node *node) | |||
static AVL_INLINE int | |||
avl_tree_node_is_unlinked(const struct avl_tree_node *node) | |||
{ | |||
return node->parent_balance == (uintptr_t)node; | |||
return node->parent_balance == (uintptr_t) node; | |||
} | |||
/* (Internal use only) */ | |||
extern void | |||
avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr, | |||
struct avl_tree_node *inserted); | |||
extern void avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr, | |||
struct avl_tree_node *inserted); | |||
/* | |||
* Looks up an item in the specified AVL tree. | |||
* | |||
* @root | |||
* Pointer to the root of the AVL tree. (This can be NULL --- that just | |||
* means the tree is empty.) | |||
* Pointer to the root of the AVL tree. (This can be NULL --- that just | |||
* means the tree is empty.) | |||
* | |||
* @cmp_ctx | |||
* First argument to pass to the comparison callback. This generally | |||
* should be a pointer to an object equal to the one being searched for. | |||
* First argument to pass to the comparison callback. This generally | |||
* should be a pointer to an object equal to the one being searched for. | |||
* | |||
* @cmp | |||
* Comparison callback. Must return < 0, 0, or > 0 if the first argument | |||
* is less than, equal to, or greater than the second argument, | |||
* respectively. The first argument will be @cmp_ctx and the second | |||
* argument will be a pointer to the AVL tree node of an item in the tree. | |||
* Comparison callback. Must return < 0, 0, or > 0 if the first argument | |||
* is less than, equal to, or greater than the second argument, | |||
* respectively. The first argument will be @cmp_ctx and the second | |||
* argument will be a pointer to the AVL tree node of an item in the tree. | |||
* | |||
* Returns a pointer to the AVL tree node of the resulting item, or NULL if the | |||
* item was not found. | |||
@@ -117,48 +116,49 @@ avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr, | |||
* Example: | |||
* | |||
* struct int_wrapper { | |||
* int data; | |||
* struct avl_tree_node index_node; | |||
* int data; | |||
* struct avl_tree_node index_node; | |||
* }; | |||
* | |||
* static int _avl_cmp_int_to_node(const void *intptr, | |||
* const struct avl_tree_node *nodeptr) | |||
* const struct avl_tree_node *nodeptr) | |||
* { | |||
* int n1 = *(const int *)intptr; | |||
* int n2 = avl_tree_entry(nodeptr, struct int_wrapper, index_node)->data; | |||
* if (n1 < n2) | |||
* return -1; | |||
* else if (n1 > n2) | |||
* return 1; | |||
* else | |||
* return 0; | |||
* int n1 = *(const int *)intptr; | |||
* int n2 = avl_tree_entry(nodeptr, struct int_wrapper, index_node)->data; | |||
* if (n1 < n2) | |||
* return -1; | |||
* else if (n1 > n2) | |||
* return 1; | |||
* else | |||
* return 0; | |||
* } | |||
* | |||
* bool contains_int(struct avl_tree_node *root, int n) | |||
* { | |||
* struct avl_tree_node *result; | |||
* struct avl_tree_node *result; | |||
* | |||
* result = avl_tree_lookup(root, &n, _avl_cmp_int_to_node); | |||
* return result ? true : false; | |||
* result = avl_tree_lookup(root, &n, _avl_cmp_int_to_node); | |||
* return result ? true : false; | |||
* } | |||
*/ | |||
static AVL_INLINE struct avl_tree_node * | |||
avl_tree_lookup(const struct avl_tree_node *root, | |||
const void *cmp_ctx, | |||
int (*cmp)(const void *, const struct avl_tree_node *)) | |||
const void *cmp_ctx, | |||
int (*cmp)(const void *, const struct avl_tree_node *)) | |||
{ | |||
const struct avl_tree_node *cur = root; | |||
const struct avl_tree_node *cur = root; | |||
while (cur) { | |||
int res = (*cmp)(cmp_ctx, cur); | |||
if (res < 0) | |||
cur = cur->left; | |||
else if (res > 0) | |||
cur = cur->right; | |||
else | |||
break; | |||
} | |||
return (struct avl_tree_node*)cur; | |||
while (cur) { | |||
int res = (*cmp)(cmp_ctx, cur); | |||
if (res < 0) { | |||
cur = cur->left; | |||
} else if (res > 0) { | |||
cur = cur->right; | |||
} else { | |||
break; | |||
} | |||
} | |||
return (struct avl_tree_node *) cur; | |||
} | |||
/* Same as avl_tree_lookup(), but uses a more specific type for the comparison | |||
@@ -167,44 +167,45 @@ avl_tree_lookup(const struct avl_tree_node *root, | |||
* embedded 'struct avl_tree_node'. */ | |||
static AVL_INLINE struct avl_tree_node * | |||
avl_tree_lookup_node(const struct avl_tree_node *root, | |||
const struct avl_tree_node *node, | |||
int (*cmp)(const struct avl_tree_node *, | |||
const struct avl_tree_node *)) | |||
const struct avl_tree_node *node, | |||
int (*cmp)(const struct avl_tree_node *, | |||
const struct avl_tree_node *)) | |||
{ | |||
const struct avl_tree_node *cur = root; | |||
const struct avl_tree_node *cur = root; | |||
while (cur) { | |||
int res = (*cmp)(node, cur); | |||
if (res < 0) | |||
cur = cur->left; | |||
else if (res > 0) | |||
cur = cur->right; | |||
else | |||
break; | |||
} | |||
return (struct avl_tree_node*)cur; | |||
while (cur) { | |||
int res = (*cmp)(node, cur); | |||
if (res < 0) { | |||
cur = cur->left; | |||
} else if (res > 0) { | |||
cur = cur->right; | |||
} else { | |||
break; | |||
} | |||
} | |||
return (struct avl_tree_node *) cur; | |||
} | |||
/* | |||
* Inserts an item into the specified AVL tree. | |||
* | |||
* @root_ptr | |||
* Location of the AVL tree's root pointer. Indirection is needed because | |||
* the root node may change as a result of rotations caused by the | |||
* insertion. Initialize *root_ptr to NULL for an empty tree. | |||
* Location of the AVL tree's root pointer. Indirection is needed because | |||
* the root node may change as a result of rotations caused by the | |||
* insertion. Initialize *root_ptr to NULL for an empty tree. | |||
* | |||
* @item | |||
* Pointer to the `struct avl_tree_node' embedded in the item to insert. | |||
* No members in it need be pre-initialized, although members in the | |||
* containing structure should be pre-initialized so that @cmp can use them | |||
* in comparisons. | |||
* Pointer to the `struct avl_tree_node' embedded in the item to insert. | |||
* No members in it need be pre-initialized, although members in the | |||
* containing structure should be pre-initialized so that @cmp can use them | |||
* in comparisons. | |||
* | |||
* @cmp | |||
* Comparison callback. Must return < 0, 0, or > 0 if the first argument | |||
* is less than, equal to, or greater than the second argument, | |||
* respectively. The first argument will be @item and the second | |||
* argument will be a pointer to an AVL tree node embedded in some | |||
* previously-inserted item to which @item is being compared. | |||
* Comparison callback. Must return < 0, 0, or > 0 if the first argument | |||
* is less than, equal to, or greater than the second argument, | |||
* respectively. The first argument will be @item and the second | |||
* argument will be a pointer to an AVL tree node embedded in some | |||
* previously-inserted item to which @item is being compared. | |||
* | |||
* If no item in the tree is comparatively equal (via @cmp) to @item, inserts | |||
* @item and returns NULL. Otherwise does nothing and returns a pointer to the | |||
@@ -214,150 +215,138 @@ avl_tree_lookup_node(const struct avl_tree_node *root, | |||
* Example: | |||
* | |||
* struct int_wrapper { | |||
* int data; | |||
* struct avl_tree_node index_node; | |||
* int data; | |||
* struct avl_tree_node index_node; | |||
* }; | |||
* | |||
* #define GET_DATA(i) avl_tree_entry((i), struct int_wrapper, index_node)->data | |||
* | |||
* static int _avl_cmp_ints(const struct avl_tree_node *node1, | |||
* const struct avl_tree_node *node2) | |||
* const struct avl_tree_node *node2) | |||
* { | |||
* int n1 = GET_DATA(node1); | |||
* int n2 = GET_DATA(node2); | |||
* if (n1 < n2) | |||
* return -1; | |||
* else if (n1 > n2) | |||
* return 1; | |||
* else | |||
* return 0; | |||
* int n1 = GET_DATA(node1); | |||
* int n2 = GET_DATA(node2); | |||
* if (n1 < n2) | |||
* return -1; | |||
* else if (n1 > n2) | |||
* return 1; | |||
* else | |||
* return 0; | |||
* } | |||
* | |||
* bool insert_int(struct avl_tree_node **root_ptr, int data) | |||
* { | |||
* struct int_wrapper *i = malloc(sizeof(struct int_wrapper)); | |||
* i->data = data; | |||
* if (avl_tree_insert(root_ptr, &i->index_node, _avl_cmp_ints)) { | |||
* // Duplicate. | |||
* free(i); | |||
* return false; | |||
* } | |||
* return true; | |||
* struct int_wrapper *i = malloc(sizeof(struct int_wrapper)); | |||
* i->data = data; | |||
* if (avl_tree_insert(root_ptr, &i->index_node, _avl_cmp_ints)) { | |||
* // Duplicate. | |||
* free(i); | |||
* return false; | |||
* } | |||
* return true; | |||
* } | |||
*/ | |||
static AVL_INLINE struct avl_tree_node * | |||
avl_tree_insert(struct avl_tree_node **root_ptr, | |||
struct avl_tree_node *item, | |||
int (*cmp)(const struct avl_tree_node *, | |||
const struct avl_tree_node *)) | |||
struct avl_tree_node *item, | |||
int (*cmp)(const struct avl_tree_node *, const struct avl_tree_node *)) | |||
{ | |||
struct avl_tree_node **cur_ptr = root_ptr, *cur = NULL; | |||
int res; | |||
struct avl_tree_node **cur_ptr = root_ptr, *cur = NULL; | |||
int res; | |||
while (*cur_ptr) { | |||
cur = *cur_ptr; | |||
res = (*cmp)(item, cur); | |||
if (res < 0) | |||
cur_ptr = &cur->left; | |||
else if (res > 0) | |||
cur_ptr = &cur->right; | |||
else | |||
return cur; | |||
} | |||
*cur_ptr = item; | |||
item->parent_balance = (uintptr_t)cur | 1; | |||
avl_tree_rebalance_after_insert(root_ptr, item); | |||
return NULL; | |||
while (*cur_ptr) { | |||
cur = *cur_ptr; | |||
res = (*cmp)(item, cur); | |||
if (res < 0) { | |||
cur_ptr = &cur->left; | |||
} else if (res > 0) { | |||
cur_ptr = &cur->right; | |||
} else { | |||
return cur; | |||
} | |||
} | |||
*cur_ptr = item; | |||
item->parent_balance = (uintptr_t) cur | 1; | |||
avl_tree_rebalance_after_insert(root_ptr, item); | |||
return NULL; | |||
} | |||
/* Removes an item from the specified AVL tree.
 * See implementation for details. */
extern void avl_tree_remove(struct avl_tree_node **root_ptr,
                            struct avl_tree_node *node);

/* Nonrecursive AVL tree traversal functions */

/* In-order traversal: start from a root, then step from node to node. */
extern struct avl_tree_node *avl_tree_first_in_order(const struct avl_tree_node *root);

extern struct avl_tree_node *avl_tree_last_in_order(const struct avl_tree_node *root);

extern struct avl_tree_node *avl_tree_next_in_order(const struct avl_tree_node *node);

extern struct avl_tree_node *avl_tree_prev_in_order(const struct avl_tree_node *node);

/* Post-order traversal: the step function takes the previous node together
 * with that node's parent. */
extern struct avl_tree_node *
avl_tree_first_in_postorder(const struct avl_tree_node *root);

extern struct avl_tree_node *
avl_tree_next_in_postorder(const struct avl_tree_node *prev,
                           const struct avl_tree_node *prev_parent);
/*
 * Iterate through the nodes in an AVL tree in sorted order.
 * You may not modify the tree during the iteration.
 *
 * @child_struct
 *     Variable that will receive a pointer to each struct inserted into the
 *     tree.
 * @root
 *     Root of the AVL tree.
 * @struct_name
 *     Type of *child_struct.
 * @struct_member
 *     Member of @struct_name type that is the AVL tree node.
 *
 * Example:
 *
 * struct int_wrapper {
 *     int data;
 *     struct avl_tree_node index_node;
 * };
 *
 * void print_ints(struct avl_tree_node *root)
 * {
 *     struct int_wrapper *i;
 *
 *     avl_tree_for_each_in_order(i, root, struct int_wrapper, index_node)
 *         printf("%d\n", i->data);
 * }
 */
#define avl_tree_for_each_in_order(child_struct, root, struct_name, struct_member)     \
    for (struct avl_tree_node *_cur = avl_tree_first_in_order(root);                   \
         _cur != NULL &&                                                               \
         ((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1);       \
         _cur = avl_tree_next_in_order(_cur))
/*
 * Like avl_tree_for_each_in_order(), but uses the reverse order: iteration
 * starts at avl_tree_last_in_order() and steps with avl_tree_prev_in_order().
 */
#define avl_tree_for_each_in_reverse_order(                                            \
    child_struct, root, struct_name, struct_member)                                    \
    for (struct avl_tree_node *_cur = avl_tree_last_in_order(root);                    \
         _cur != NULL &&                                                               \
         ((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1);       \
         _cur = avl_tree_prev_in_order(_cur))
/*
 * Like avl_tree_for_each_in_order(), but iterates through the nodes in
 * postorder, so the current node may be deleted or freed.
 *
 * The parent of the current node is captured in the loop condition, before
 * the body runs, so the step expression never reads the current node again.
 */
#define avl_tree_for_each_in_postorder(child_struct, root, struct_name, struct_member) \
    for (struct avl_tree_node *_cur = avl_tree_first_in_postorder(root), *_parent;     \
         _cur != NULL &&                                                               \
         ((child_struct) = avl_tree_entry(_cur, struct_name, struct_member), 1) &&     \
         (_parent = avl_get_parent(_cur), 1);                                          \
         _cur = avl_tree_next_in_postorder(_cur, _parent))
#endif /* _AVL_TREE_H_ */ |
@@ -23,55 +23,56 @@ SOFTWARE. | |||
#pragma once | |||
#ifndef PY_SSIZE_T_CLEAN | |||
#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html | |||
# define PY_SSIZE_T_CLEAN // See: https://docs.python.org/3/c-api/arg.html | |||
#endif | |||
#include <Python.h> | |||
#include <structmember.h> | |||
#include <bytesobject.h> | |||
#include <structmember.h> | |||
#include "avl_tree.h" | |||
/* Compatibility macros */

/* NOTE(review): uint64_t is normally a typedef rather than a macro, in which
 * case this #ifndef is always true -- confirm the intended behaviour. */
#ifndef uint64_t
#    define uint64_t unsigned PY_LONG_LONG
#endif

/* Route the C allocator names through the Python object allocator. */
#define malloc  PyObject_Malloc // XXX: yuck
#define realloc PyObject_Realloc
#define free    PyObject_Free

/* Unicode support macros */

/* Build a one-character unicode object; `chr` must be an lvalue because its
 * address is taken. */
#define PyUnicode_FROM_SINGLE(chr)                                                     \
    PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)
/* Error handling macros */

/*
 * All of these expect a variable named `self` in scope with `route_state` and
 * `route_context` members.
 *
 * BAD_ROUTE          - current route_state flag (non-zero after FAIL_ROUTE).
 * BAD_ROUTE_CONTEXT  - context recorded by the most recent FAIL_ROUTE.
 * FAIL_ROUTE(ctx)    - set the flag and record `ctx`; wrapped in do/while(0)
 *                      so it behaves as a single statement in if/else.
 * RESET_ROUTE()      - clear the flag; parenthesized so the assignment cannot
 *                      absorb neighbouring operators at the call site.
 */
#define BAD_ROUTE         (self->route_state)
#define BAD_ROUTE_CONTEXT (self->route_context)
#define FAIL_ROUTE(context)                                                            \
    do {                                                                               \
        self->route_state = 1;                                                         \
        self->route_context = (context);                                               \
    } while (0)
#define RESET_ROUTE() (self->route_state = 0)
/* Shared globals */ | |||
extern char** entitydefs; | |||
extern char **entitydefs; | |||
extern PyObject* NOARGS; | |||
extern PyObject* definitions; | |||
extern PyObject *NOARGS; | |||
extern PyObject *definitions; | |||
/* Structs */ | |||
typedef struct { | |||
Py_ssize_t capacity; | |||
Py_ssize_t length; | |||
PyObject* object; | |||
PyObject *object; | |||
int kind; | |||
void* data; | |||
void *data; | |||
} Textbuffer; | |||
typedef struct { | |||
@@ -80,19 +81,19 @@ typedef struct { | |||
} StackIdent; | |||
struct Stack { | |||
PyObject* stack; | |||
PyObject *stack; | |||
uint64_t context; | |||
Textbuffer* textbuffer; | |||
Textbuffer *textbuffer; | |||
StackIdent ident; | |||
struct Stack* next; | |||
struct Stack *next; | |||
}; | |||
typedef struct Stack Stack; | |||
typedef struct { | |||
PyObject* object; /* base PyUnicodeObject object */ | |||
Py_ssize_t length; /* length of object, in code points */ | |||
int kind; /* object's kind value */ | |||
void* data; /* object's raw unicode buffer */ | |||
PyObject *object; /* base PyUnicodeObject object */ | |||
Py_ssize_t length; /* length of object, in code points */ | |||
int kind; /* object's kind value */ | |||
void *data; /* object's raw unicode buffer */ | |||
} TokenizerInput; | |||
typedef struct avl_tree_node avl_tree; | |||
@@ -104,13 +105,13 @@ typedef struct { | |||
typedef struct { | |||
PyObject_HEAD | |||
TokenizerInput text; /* text to tokenize */ | |||
Stack* topstack; /* topmost stack */ | |||
Py_ssize_t head; /* current position in text */ | |||
int global; /* global context */ | |||
int depth; /* stack recursion depth */ | |||
int route_state; /* whether a BadRoute has been triggered */ | |||
uint64_t route_context; /* context when the last BadRoute was triggered */ | |||
avl_tree* bad_routes; /* stack idents for routes known to fail */ | |||
int skip_style_tags; /* temp fix for the sometimes broken tag parser */ | |||
TokenizerInput text; /* text to tokenize */ | |||
Stack *topstack; /* topmost stack */ | |||
Py_ssize_t head; /* current position in text */ | |||
int global; /* global context */ | |||
int depth; /* stack recursion depth */ | |||
int route_state; /* whether a BadRoute has been triggered */ | |||
uint64_t route_context; /* context when the last BadRoute was triggered */ | |||
avl_tree *bad_routes; /* stack idents for routes known to fail */ | |||
int skip_style_tags; /* temp fix for the sometimes broken tag parser */ | |||
} Tokenizer; |
@@ -89,11 +89,17 @@ SOFTWARE. | |||
/* Aggregate contexts: each AGG_* value is the bitwise OR of several LC_* flags. */

#define AGG_FAIL                                                                       \
    (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING |        \
     LC_TAG | LC_STYLE | LC_TABLE_OPEN)

#define AGG_UNSAFE                                                                     \
    (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE |                        \
     LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)

#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)

#define AGG_NO_WIKILINKS                                                               \
    (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)

#define AGG_NO_EXT_LINKS                                                               \
    (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)
/* Tag contexts */ | |||
@@ -27,7 +27,8 @@ SOFTWARE. | |||
See the Python version for data sources. | |||
*/ | |||
static const char* URI_SCHEMES[] = { | |||
// clang-format off | |||
static const char *URI_SCHEMES[] = { | |||
"bitcoin", | |||
"ftp", | |||
"ftps", | |||
@@ -55,10 +56,10 @@ static const char* URI_SCHEMES[] = { | |||
"urn", | |||
"worldwind", | |||
"xmpp", | |||
NULL, | |||
NULL, | |||
}; | |||
static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = { | |||
static const char *URI_SCHEMES_AUTHORITY_OPTIONAL[] = { | |||
"bitcoin", | |||
"geo", | |||
"magnet", | |||
@@ -73,7 +74,7 @@ static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = { | |||
NULL, | |||
}; | |||
static const char* PARSER_BLACKLIST[] = { | |||
static const char *PARSER_BLACKLIST[] = { | |||
"categorytree", | |||
"ce", | |||
"chem", | |||
@@ -93,32 +94,32 @@ static const char* PARSER_BLACKLIST[] = { | |||
"timeline", | |||
NULL, | |||
}; | |||
// clang-format on | |||
/* Tags accepted by is_single(); NULL-terminated. */
static const char *SINGLE[] = {
    "br", "wbr", "hr", "meta", "link", "img",
    "li", "dt",  "dd", "th",   "td",   "tr",
    NULL,
};

/* Tags accepted by is_single_only(); NULL-terminated. */
static const char *SINGLE_ONLY[] = {
    "br", "wbr", "hr", "meta", "link", "img",
    NULL,
};
/* | |||
Convert a PyUnicodeObject to a lowercase ASCII char* array and store it in | |||
the second argument. The caller must free the return value when finished. | |||
If the return value is NULL, the conversion failed and *string is not set. | |||
*/ | |||
static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string) | |||
static PyObject * | |||
unicode_to_lcase_ascii(PyObject *input, const char **string) | |||
{ | |||
PyObject *lower = PyObject_CallMethod(input, "lower", NULL), *bytes; | |||
if (!lower) | |||
if (!lower) { | |||
return NULL; | |||
} | |||
bytes = PyUnicode_AsASCIIString(lower); | |||
Py_DECREF(lower); | |||
if (!bytes) { | |||
if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) | |||
if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) { | |||
PyErr_Clear(); | |||
} | |||
return NULL; | |||
} | |||
*string = PyBytes_AS_STRING(bytes); | |||
@@ -128,14 +129,16 @@ static PyObject* unicode_to_lcase_ascii(PyObject *input, const char **string) | |||
/* | |||
Return whether a PyUnicodeObject is in a list of lowercase ASCII strings. | |||
*/ | |||
static int unicode_in_string_list(PyObject *input, const char **list) | |||
static int | |||
unicode_in_string_list(PyObject *input, const char **list) | |||
{ | |||
const char *string; | |||
PyObject *temp = unicode_to_lcase_ascii(input, &string); | |||
int retval = 0; | |||
if (!temp) | |||
if (!temp) { | |||
return 0; | |||
} | |||
while (*list) { | |||
if (!strcmp(*(list++), string)) { | |||
@@ -144,7 +147,7 @@ static int unicode_in_string_list(PyObject *input, const char **list) | |||
} | |||
} | |||
end: | |||
end: | |||
Py_DECREF(temp); | |||
return retval; | |||
} | |||
@@ -152,7 +155,8 @@ static int unicode_in_string_list(PyObject *input, const char **list) | |||
/* | |||
Return if the given tag's contents should be passed to the parser. | |||
*/ | |||
int is_parsable(PyObject *tag) | |||
int | |||
is_parsable(PyObject *tag) | |||
{ | |||
return !unicode_in_string_list(tag, PARSER_BLACKLIST); | |||
} | |||
@@ -160,7 +164,8 @@ int is_parsable(PyObject *tag) | |||
/* | |||
Return whether or not the given tag can exist without a close tag. | |||
*/ | |||
int is_single(PyObject *tag) | |||
int | |||
is_single(PyObject *tag) | |||
{ | |||
return unicode_in_string_list(tag, SINGLE); | |||
} | |||
@@ -168,7 +173,8 @@ int is_single(PyObject *tag) | |||
/* | |||
Return whether or not the given tag must exist without a close tag. | |||
*/ | |||
int is_single_only(PyObject *tag) | |||
int | |||
is_single_only(PyObject *tag) | |||
{ | |||
return unicode_in_string_list(tag, SINGLE_ONLY); | |||
} | |||
@@ -176,10 +182,12 @@ int is_single_only(PyObject *tag) | |||
/* | |||
Return whether the given scheme is valid for external links. | |||
*/ | |||
int is_scheme(PyObject *scheme, int slashes) | |||
int | |||
is_scheme(PyObject *scheme, int slashes) | |||
{ | |||
if (slashes) | |||
if (slashes) { | |||
return unicode_in_string_list(scheme, URI_SCHEMES); | |||
else | |||
} else { | |||
return unicode_in_string_list(scheme, URI_SCHEMES_AUTHORITY_OPTIONAL); | |||
} | |||
} |
@@ -28,12 +28,11 @@ SOFTWARE. | |||
/* Functions */ | |||
int is_parsable(PyObject*); | |||
int is_single(PyObject*); | |||
int is_single_only(PyObject*); | |||
int is_scheme(PyObject*, int); | |||
int is_parsable(PyObject *); | |||
int is_single(PyObject *); | |||
int is_single_only(PyObject *); | |||
int is_scheme(PyObject *, int); | |||
/* Macros */

/*
    Map a list markup character to its HTML tag name: ':' gives "dd", ';'
    gives "dt", and anything else gives "li". The argument is parenthesized
    so that an expression argument is compared as a whole; note that it may
    still be evaluated twice.
*/
#define GET_HTML_TAG(markup)                                                           \
    ((markup) == ':' ? "dd" : (markup) == ';' ? "dt" : "li")
@@ -26,13 +26,14 @@ SOFTWARE. | |||
/* | |||
Initialize a new TagData object. | |||
*/ | |||
TagData* TagData_new(TokenizerInput* text) | |||
TagData * | |||
TagData_new(TokenizerInput *text) | |||
{ | |||
#define ALLOC_BUFFER(name) \ | |||
name = Textbuffer_new(text); \ | |||
if (!name) { \ | |||
TagData_dealloc(self); \ | |||
return NULL; \ | |||
#define ALLOC_BUFFER(name) \ | |||
name = Textbuffer_new(text); \ | |||
if (!name) { \ | |||
TagData_dealloc(self); \ | |||
return NULL; \ | |||
} | |||
TagData *self = malloc(sizeof(TagData)); | |||
@@ -54,25 +55,30 @@ TagData* TagData_new(TokenizerInput* text) | |||
/* | |||
Deallocate the given TagData object. | |||
*/ | |||
void TagData_dealloc(TagData* self) | |||
void | |||
TagData_dealloc(TagData *self) | |||
{ | |||
if (self->pad_first) | |||
if (self->pad_first) { | |||
Textbuffer_dealloc(self->pad_first); | |||
if (self->pad_before_eq) | |||
} | |||
if (self->pad_before_eq) { | |||
Textbuffer_dealloc(self->pad_before_eq); | |||
if (self->pad_after_eq) | |||
} | |||
if (self->pad_after_eq) { | |||
Textbuffer_dealloc(self->pad_after_eq); | |||
} | |||
free(self); | |||
} | |||
/* | |||
Clear the internal buffers of the given TagData object. | |||
*/ | |||
int TagData_reset_buffers(TagData* self) | |||
int | |||
TagData_reset_buffers(TagData *self) | |||
{ | |||
if (Textbuffer_reset(self->pad_first) || | |||
Textbuffer_reset(self->pad_before_eq) || | |||
Textbuffer_reset(self->pad_after_eq)) | |||
if (Textbuffer_reset(self->pad_first) || Textbuffer_reset(self->pad_before_eq) || | |||
Textbuffer_reset(self->pad_after_eq)) { | |||
return -1; | |||
} | |||
return 0; | |||
} |
@@ -29,15 +29,15 @@ SOFTWARE. | |||
typedef struct { | |||
uint64_t context; | |||
Textbuffer* pad_first; | |||
Textbuffer* pad_before_eq; | |||
Textbuffer* pad_after_eq; | |||
Textbuffer *pad_first; | |||
Textbuffer *pad_before_eq; | |||
Textbuffer *pad_after_eq; | |||
Py_UCS4 quoter; | |||
Py_ssize_t reset; | |||
} TagData; | |||
/* Functions */ | |||
TagData* TagData_new(TokenizerInput*); | |||
void TagData_dealloc(TagData*); | |||
int TagData_reset_buffers(TagData*); | |||
TagData *TagData_new(TokenizerInput *); | |||
void TagData_dealloc(TagData *); | |||
int TagData_reset_buffers(TagData *); |
@@ -23,20 +23,22 @@ SOFTWARE. | |||
#include "textbuffer.h" | |||
/* Capacity given to a freshly allocated (or reset) textbuffer. */
#define INITIAL_CAPACITY 32
/* Tuning constants for growing a textbuffer. */
#define RESIZE_FACTOR    2
#define CONCAT_EXTRA     32
/* | |||
Internal allocation function for textbuffers. | |||
*/ | |||
static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar) | |||
static int | |||
internal_alloc(Textbuffer *self, Py_UCS4 maxchar) | |||
{ | |||
self->capacity = INITIAL_CAPACITY; | |||
self->length = 0; | |||
self->object = PyUnicode_New(self->capacity, maxchar); | |||
if (!self->object) | |||
if (!self->object) { | |||
return -1; | |||
} | |||
self->kind = PyUnicode_KIND(self->object); | |||
self->data = PyUnicode_DATA(self->object); | |||
@@ -46,7 +48,8 @@ static int internal_alloc(Textbuffer* self, Py_UCS4 maxchar) | |||
/* | |||
Internal deallocation function for textbuffers. | |||
*/ | |||
static void internal_dealloc(Textbuffer* self) | |||
static void | |||
internal_dealloc(Textbuffer *self) | |||
{ | |||
Py_DECREF(self->object); | |||
} | |||
@@ -54,14 +57,16 @@ static void internal_dealloc(Textbuffer* self) | |||
/* | |||
Internal resize function. | |||
*/ | |||
static int internal_resize(Textbuffer* self, Py_ssize_t new_cap) | |||
static int | |||
internal_resize(Textbuffer *self, Py_ssize_t new_cap) | |||
{ | |||
PyObject *newobj; | |||
void *newdata; | |||
newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object)); | |||
if (!newobj) | |||
if (!newobj) { | |||
return -1; | |||
} | |||
newdata = PyUnicode_DATA(newobj); | |||
memcpy(newdata, self->data, self->length * self->kind); | |||
Py_DECREF(self->object); | |||
@@ -75,22 +80,25 @@ static int internal_resize(Textbuffer* self, Py_ssize_t new_cap) | |||
/* | |||
Create a new textbuffer object. | |||
*/ | |||
Textbuffer* Textbuffer_new(TokenizerInput* text) | |||
Textbuffer * | |||
Textbuffer_new(TokenizerInput *text) | |||
{ | |||
Textbuffer* self = malloc(sizeof(Textbuffer)); | |||
Textbuffer *self = malloc(sizeof(Textbuffer)); | |||
Py_UCS4 maxchar = 0; | |||
maxchar = PyUnicode_MAX_CHAR_VALUE(text->object); | |||
if (!self) | |||
if (!self) { | |||
goto fail_nomem; | |||
if (internal_alloc(self, maxchar) < 0) | |||
} | |||
if (internal_alloc(self, maxchar) < 0) { | |||
goto fail_dealloc; | |||
} | |||
return self; | |||
fail_dealloc: | |||
fail_dealloc: | |||
free(self); | |||
fail_nomem: | |||
fail_nomem: | |||
PyErr_NoMemory(); | |||
return NULL; | |||
} | |||
@@ -98,7 +106,8 @@ Textbuffer* Textbuffer_new(TokenizerInput* text) | |||
/* | |||
Deallocate the given textbuffer. | |||
*/ | |||
void Textbuffer_dealloc(Textbuffer* self) | |||
void | |||
Textbuffer_dealloc(Textbuffer *self) | |||
{ | |||
internal_dealloc(self); | |||
free(self); | |||
@@ -107,26 +116,30 @@ void Textbuffer_dealloc(Textbuffer* self) | |||
/* | |||
Reset a textbuffer to its initial, empty state. | |||
*/ | |||
int Textbuffer_reset(Textbuffer* self) | |||
int | |||
Textbuffer_reset(Textbuffer *self) | |||
{ | |||
Py_UCS4 maxchar = 0; | |||
maxchar = PyUnicode_MAX_CHAR_VALUE(self->object); | |||
internal_dealloc(self); | |||
if (internal_alloc(self, maxchar)) | |||
if (internal_alloc(self, maxchar)) { | |||
return -1; | |||
} | |||
return 0; | |||
} | |||
/* | |||
Write a Unicode codepoint to the given textbuffer. | |||