From 237798a17eee319a713664f137dabc42a0362bdf Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Mon, 21 Dec 2020 01:59:14 -0500
Subject: [PATCH] Update tag definitions

---
Notes (ignored by git-am):
  Summary of the hunks below.
  * URI schemes: adds "redis" and sorts the entries alphabetically, in both
    the Python dict and the C tables.
  * PARSER_BLACKLIST: adds "ce", "chem" and "graph" (Python and C).
  * INVISIBLE_TAGS: adds "graph" (Python only; no C hunk touches it).
  * SINGLE / SINGLE_ONLY: same members, "wbr" moved to follow "br".

 mwparserfromhell/definitions.py                  | 76 ++++++++++++++++++------
 mwparserfromhell/parser/ctokenizer/definitions.c | 72 ++++++++++++++++++----
 2 files changed, 120 insertions(+), 28 deletions(-)

diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py
index 6191dc6..0e70cc1 100644
--- a/mwparserfromhell/definitions.py
+++ b/mwparserfromhell/definitions.py
@@ -27,35 +27,77 @@ When updating this file, please also update the the C tokenizer version:
 - mwparserfromhell/parser/ctokenizer/definitions.h
 """
 
-
 __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
            "is_single_only", "is_scheme"]
 
 URI_SCHEMES = {
-    # [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943
-    "http": True, "https": True, "ftp": True, "ftps": True, "ssh": True,
-    "sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False,
-    "sips": False, "gopher": True, "telnet": True, "nntp": True,
-    "worldwind": True, "mailto": False, "tel": False, "sms": False,
-    "news": False, "svn": True, "git": True, "mms": True, "bitcoin": False,
-    "magnet": False, "urn": False, "geo": False
+    # [wikimedia/mediawiki.git]/includes/DefaultSettings.php @ 5c660de5d0
+    "bitcoin": False,
+    "ftp": True,
+    "ftps": True,
+    "geo": False,
+    "git": True,
+    "gopher": True,
+    "http": True,
+    "https": True,
+    "irc": True,
+    "ircs": True,
+    "magnet": False,
+    "mailto": False,
+    "mms": True,
+    "news": False,
+    "nntp": True,
+    "redis": True,
+    "sftp": True,
+    "sip": False,
+    "sips": False,
+    "sms": False,
+    "ssh": True,
+    "svn": True,
+    "tel": False,
+    "telnet": True,
+    "urn": False,
+    "worldwind": True,
+    "xmpp": False,
 }
 
 PARSER_BLACKLIST = [
-    # enwiki extensions @ 2013-06-28
-    "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
-    "nowiki", "pre", "score", "section", "source", "syntaxhighlight",
-    "templatedata", "timeline"
+    # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
+    "categorytree",
+    "ce",
+    "chem",
+    "gallery",
+    "graph",
+    "hiero",
+    "imagemap",
+    "inputbox",
+    "math",
+    "nowiki",
+    "pre",
+    "score",
+    "section",
+    "source",
+    "syntaxhighlight",
+    "templatedata",
+    "timeline",
 ]
 
 INVISIBLE_TAGS = [
-    # enwiki extensions @ 2013-06-28
-    "categorytree", "gallery", "imagemap", "inputbox", "math", "score",
-    "section", "templatedata", "timeline"
+    # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
+    "categorytree",
+    "gallery",
+    "graph",
+    "imagemap",
+    "inputbox",
+    "math",
+    "score",
+    "section",
+    "templatedata",
+    "timeline"
 ]
 
-# [mediawiki/core.git]/includes/Sanitizer.php @ 065bec63ea
-SINGLE_ONLY = ["br", "hr", "meta", "link", "img", "wbr"]
+# [wikimedia/mediawiki.git]/includes/parser/Sanitizer.php @ 95e17ee645
+SINGLE_ONLY = ["br", "wbr", "hr", "meta", "link", "img"]
 SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"]
 
 MARKUP_TO_HTML = {
diff --git a/mwparserfromhell/parser/ctokenizer/definitions.c b/mwparserfromhell/parser/ctokenizer/definitions.c
index e247234..b1ff278 100644
--- a/mwparserfromhell/parser/ctokenizer/definitions.c
+++ b/mwparserfromhell/parser/ctokenizer/definitions.c
@@ -28,29 +28,79 @@ SOFTWARE.
 */
 
 static const char* URI_SCHEMES[] = {
-    "http", "https", "ftp", "ftps", "ssh", "sftp", "irc", "ircs", "xmpp",
-    "sip", "sips", "gopher", "telnet", "nntp", "worldwind", "mailto", "tel",
-    "sms", "news", "svn", "git", "mms", "bitcoin", "magnet", "urn", "geo", NULL
+    "bitcoin",
+    "ftp",
+    "ftps",
+    "geo",
+    "git",
+    "gopher",
+    "http",
+    "https",
+    "irc",
+    "ircs",
+    "magnet",
+    "mailto",
+    "mms",
+    "news",
+    "nntp",
+    "redis",
+    "sftp",
+    "sip",
+    "sips",
+    "sms",
+    "ssh",
+    "svn",
+    "tel",
+    "telnet",
+    "urn",
+    "worldwind",
+    "xmpp",
+    NULL,
 };
 
 static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
-    "xmpp", "sip", "sips", "mailto", "tel", "sms", "news", "bitcoin", "magnet",
-    "urn", "geo", NULL
+    "bitcoin",
+    "geo",
+    "magnet",
+    "mailto",
+    "news",
+    "sip",
+    "sips",
+    "sms",
+    "tel",
+    "urn",
+    "xmpp",
+    NULL,
 };
 
 static const char* PARSER_BLACKLIST[] = {
-    "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
-    "nowiki", "pre", "score", "section", "source", "syntaxhighlight",
-    "templatedata", "timeline", NULL
+    "categorytree",
+    "ce",
+    "chem",
+    "gallery",
+    "graph",
+    "hiero",
+    "imagemap",
+    "inputbox",
+    "math",
+    "nowiki",
+    "pre",
+    "score",
+    "section",
+    "source",
+    "syntaxhighlight",
+    "templatedata",
+    "timeline",
+    NULL,
 };
 
 static const char* SINGLE[] = {
-    "br", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr",
-    "wbr", NULL
+    "br", "wbr", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td",
+    "tr", NULL
 };
 
 static const char* SINGLE_ONLY[] = {
-    "br", "hr", "meta", "link", "img", "wbr", NULL
+    "br", "wbr", "hr", "meta", "link", "img", NULL
 };
 
 /*