Browse Source

Update tag definitions

tags/v0.6
Ben Kurtovic 3 years ago
parent
commit
237798a17e
2 changed files with 120 additions and 28 deletions
  1. +59
    -17
      mwparserfromhell/definitions.py
  2. +61
    -11
      mwparserfromhell/parser/ctokenizer/definitions.c

+ 59
- 17
mwparserfromhell/definitions.py View File

@@ -27,35 +27,77 @@ When updating this file, please also update the C tokenizer version:
- mwparserfromhell/parser/ctokenizer/definitions.h
"""


__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
"is_single_only", "is_scheme"]

URI_SCHEMES = {
# [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943
"http": True, "https": True, "ftp": True, "ftps": True, "ssh": True,
"sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False,
"sips": False, "gopher": True, "telnet": True, "nntp": True,
"worldwind": True, "mailto": False, "tel": False, "sms": False,
"news": False, "svn": True, "git": True, "mms": True, "bitcoin": False,
"magnet": False, "urn": False, "geo": False
# [wikimedia/mediawiki.git]/includes/DefaultSettings.php @ 5c660de5d0
"bitcoin": False,
"ftp": True,
"ftps": True,
"geo": False,
"git": True,
"gopher": True,
"http": True,
"https": True,
"irc": True,
"ircs": True,
"magnet": False,
"mailto": False,
"mms": True,
"news": False,
"nntp": True,
"redis": True,
"sftp": True,
"sip": False,
"sips": False,
"sms": False,
"ssh": True,
"svn": True,
"tel": False,
"telnet": True,
"urn": False,
"worldwind": True,
"xmpp": False,
}

PARSER_BLACKLIST = [
# enwiki extensions @ 2013-06-28
"categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
"nowiki", "pre", "score", "section", "source", "syntaxhighlight",
"templatedata", "timeline"
# https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
"categorytree",
"ce",
"chem",
"gallery",
"graph",
"hiero",
"imagemap",
"inputbox",
"math",
"nowiki",
"pre",
"score",
"section",
"source",
"syntaxhighlight",
"templatedata",
"timeline",
]

INVISIBLE_TAGS = [
# enwiki extensions @ 2013-06-28
"categorytree", "gallery", "imagemap", "inputbox", "math", "score",
"section", "templatedata", "timeline"
# https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
"categorytree",
"gallery",
"graph",
"imagemap",
"inputbox",
"math",
"score",
"section",
"templatedata",
"timeline"
]

# [mediawiki/core.git]/includes/Sanitizer.php @ 065bec63ea
SINGLE_ONLY = ["br", "hr", "meta", "link", "img", "wbr"]
# [wikimedia/mediawiki.git]/includes/parser/Sanitizer.php @ 95e17ee645
SINGLE_ONLY = ["br", "wbr", "hr", "meta", "link", "img"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"]

MARKUP_TO_HTML = {


+ 61
- 11
mwparserfromhell/parser/ctokenizer/definitions.c View File

@@ -28,29 +28,79 @@ SOFTWARE.
*/

static const char* URI_SCHEMES[] = {
"http", "https", "ftp", "ftps", "ssh", "sftp", "irc", "ircs", "xmpp",
"sip", "sips", "gopher", "telnet", "nntp", "worldwind", "mailto", "tel",
"sms", "news", "svn", "git", "mms", "bitcoin", "magnet", "urn", "geo", NULL
"bitcoin",
"ftp",
"ftps",
"geo",
"git",
"gopher",
"http",
"https",
"irc",
"ircs",
"magnet",
"mailto",
"mms",
"news",
"nntp",
"redis",
"sftp",
"sip",
"sips",
"sms",
"ssh",
"svn",
"tel",
"telnet",
"urn",
"worldwind",
"xmpp",
NULL,
};

static const char* URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
"xmpp", "sip", "sips", "mailto", "tel", "sms", "news", "bitcoin", "magnet",
"urn", "geo", NULL
"bitcoin",
"geo",
"magnet",
"mailto",
"news",
"sip",
"sips",
"sms",
"tel",
"urn",
"xmpp",
NULL,
};

static const char* PARSER_BLACKLIST[] = {
"categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
"nowiki", "pre", "score", "section", "source", "syntaxhighlight",
"templatedata", "timeline", NULL
"categorytree",
"ce",
"chem",
"gallery",
"graph",
"hiero",
"imagemap",
"inputbox",
"math",
"nowiki",
"pre",
"score",
"section",
"source",
"syntaxhighlight",
"templatedata",
"timeline",
NULL,
};

static const char* SINGLE[] = {
"br", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr",
"wbr", NULL
"br", "wbr", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td",
"tr", NULL
};

static const char* SINGLE_ONLY[] = {
"br", "hr", "meta", "link", "img", "wbr", NULL
"br", "wbr", "hr", "meta", "link", "img", NULL
};

/*


Loading…
Cancel
Save