From de6d5074fa37af6944764a3ffb09e94e27eb0842 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Tue, 30 Jul 2013 15:37:06 -0400
Subject: [PATCH 01/27] Tag.showtag -> Tag.wiki_markup

---
 mwparserfromhell/nodes/tag.py      | 26 +++++++++++++-------------
 mwparserfromhell/parser/builder.py |  8 +++-----
 mwparserfromhell/parser/tokens.py  |  6 +-----
 mwparserfromhell/tag_defs.py       | 10 +++++-----
 tests/_test_tree_equality.py       |  2 +-
 tests/test_tag.py                  | 20 ++++++++++----------
 tests/test_tokens.py               |  6 +++---
 7 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index 08d5204..25c0708 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -24,7 +24,7 @@ from __future__ import unicode_literals
 
 from . import Node, Text
 from ..compat import str
-from ..tag_defs import get_wikicode, is_visible
+from ..tag_defs import get_wiki_markup, is_visible
 from ..utils import parse_anything
 
 __all__ = ["Tag"]
@@ -32,7 +32,7 @@ __all__ = ["Tag"]
 class Tag(Node):
     """Represents an HTML-style tag in wikicode, like ``<ref>``."""
 
-    def __init__(self, tag, contents=None, attrs=None, showtag=True,
+    def __init__(self, tag, contents=None, attrs=None, wiki_markup=False,
                  self_closing=False, invalid=False, implicit=False,
                  padding="", closing_tag=None):
         super(Tag, self).__init__()
@@ -42,7 +42,7 @@
         else:
             self._contents = contents
         self._attrs = attrs if attrs else []
-        self._showtag = showtag
+        self._wiki_markup = wiki_markup
         self._self_closing = self_closing
         self._invalid = invalid
         self._implicit = implicit
@@ -53,8 +53,8 @@
             self._closing_tag = tag
 
     def __unicode__(self):
-        if not self.showtag:
-            open_, close = get_wikicode(self.tag)
+        if self.wiki_markup:
+            open_, close = get_wiki_markup(self.tag)
             if self.self_closing:
                 return open_
             else:
                 return open_ + str(self.contents) + close
@@ -72,7 +72,7 @@
 
     def __iternodes__(self, getter):
         yield None, self
-        if self.showtag:
+        if not self.wiki_markup:
             for child in getter(self.tag):
                 yield self.tag, child
             for attr in self.attributes:
@@ -84,7 +84,7 @@
         if self.contents:
             for child in getter(self.contents):
                 yield self.contents, child
-        if not self.self_closing and self.showtag and self.closing_tag:
+        if not self.self_closing and not self.wiki_markup and self.closing_tag:
             for child in getter(self.closing_tag):
                 yield self.closing_tag, child
@@ -131,9 +131,9 @@
         return self._attrs
 
     @property
-    def showtag(self):
-        """Whether to show the tag itself instead of a wikicode version."""
-        return self._showtag
+    def wiki_markup(self):
+        """Whether to show the wiki version of a tag instead of the HTML."""
+        return self._wiki_markup
 
     @property
     def self_closing(self):
@@ -183,9 +183,9 @@
     def contents(self, value):
         self._contents = parse_anything(value)
 
-    @showtag.setter
-    def showtag(self, value):
-        self._showtag = bool(value)
+    @wiki_markup.setter
+    def wiki_markup(self, value):
+        self._wiki_markup = bool(value)
 
     @self_closing.setter
     def self_closing(self, value):
diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py
index 9366742..ef55776 100644
--- a/mwparserfromhell/parser/builder.py
+++ b/mwparserfromhell/parser/builder.py
@@ -207,8 +207,7 @@
         """Handle a case where a tag is at the head of the tokens."""
         close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose)
         implicit, attrs, contents, closing_tag = False, [], None, None
-        showtag = token.get("showtag", True)
-        invalid = token.get("invalid", False)
+        
wiki_markup, invalid = token.wiki_markup, token.invalid self._push() while self._tokens: token = self._tokens.pop() @@ -225,12 +224,11 @@ class Builder(object): if isinstance(token, tokens.TagCloseSelfclose): tag = self._pop() self_closing = True - padding = token.padding - implicit = token.get("implicit", False) + padding, implicit = token.padding, token.implicit else: self_closing = False closing_tag = self._pop() - return Tag(tag, contents, attrs, showtag, self_closing, + return Tag(tag, contents, attrs, wiki_markup, self_closing, invalid, implicit, padding, closing_tag) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index f3d89fc..6dd3446 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -55,7 +55,7 @@ class Token(object): return False def __getattr__(self, key): - return self._kwargs[key] + return self._kwargs.get(key, False) def __setattr__(self, key, value): self._kwargs[key] = value @@ -63,10 +63,6 @@ class Token(object): def __delattr__(self, key): del self._kwargs[key] - def get(self, key, default=None): - """Same as :py:meth:`__getattr__`, but has a *default* if missing.""" - return self._kwargs.get(key, default) - def make(name): """Create a new Token class using ``type()`` and add it to ``__all__``.""" diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py index 73493d3..c918b4d 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/tag_defs.py @@ -24,7 +24,7 @@ from __future__ import unicode_literals -__all__ = ["get_wikicode", "is_parsable", "is_visible", "is_single", +__all__ = ["get_wiki_markup", "is_parsable", "is_visible", "is_single", "is_single_only"] PARSER_BLACKLIST = [ @@ -44,7 +44,7 @@ INVISIBLE_TAGS = [ SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] -WIKICODE = { +WIKI_MARKUP = { "i": {"open": "''", "close": "''"}, "b": {"open": "'''", "close": "'''"}, "ul": {"open": "*"}, @@ -54,9 +54,9 @@ WIKICODE = { "hr": {"open": "----"}, } -def get_wikicode(tag): - """Return the appropriate wikicode before and after the given *tag*.""" - data = WIKICODE[tag.lower()] +def get_wiki_markup(tag): + """Return the appropriate wiki markup before and after the given *tag*.""" + data = WIKI_MARKUP[tag.lower()] return (data.get("open"), data.get("close")) def is_parsable(tag): diff --git a/tests/_test_tree_equality.py b/tests/_test_tree_equality.py index cfda97b..3267b45 100644 --- a/tests/_test_tree_equality.py +++ b/tests/_test_tree_equality.py @@ -106,7 +106,7 @@ class TreeEqualityTestCase(TestCase): self.assertEqual(exp_attr.pad_first, act_attr.pad_first) self.assertEqual(exp_attr.pad_before_eq, act_attr.pad_before_eq) self.assertEqual(exp_attr.pad_after_eq, act_attr.pad_after_eq) - self.assertIs(expected.showtag, actual.showtag) + self.assertIs(expected.wiki_markup, actual.wiki_markup) self.assertIs(expected.self_closing, actual.self_closing) self.assertIs(expected.invalid, actual.invalid) self.assertIs(expected.implicit, actual.implicit) diff --git a/tests/test_tag.py b/tests/test_tag.py index 6755270..a0fbcf1 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -50,8 +50,8 @@ class TestTag(TreeEqualityTestCase): implicit=True) node7 = Tag(wraptext("br"), self_closing=True, invalid=True, padding=" ") - node8 = Tag(wraptext("hr"), showtag=False, self_closing=True) - node9 = Tag(wraptext("i"), wraptext("italics!"), showtag=False) + node8 = Tag(wraptext("hr"), wiki_markup=True, 
self_closing=True) + node9 = Tag(wraptext("i"), wraptext("italics!"), wiki_markup=True) self.assertEqual("", str(node1)) self.assertEqual('foo', str(node2)) @@ -72,7 +72,7 @@ class TestTag(TreeEqualityTestCase): # foobar node1 = Tag(wrap([node1n1]), wrap([node1n2])) # '''bold text''' - node2 = Tag(wraptext("i"), wrap([node2n1]), showtag=False) + node2 = Tag(wraptext("i"), wrap([node2n1]), wiki_markup=True) # node3 = Tag(wrap([node3n1]), attrs=[Attribute(wrap([node3n2]), wrap([node3n3])), @@ -156,15 +156,15 @@ class TestTag(TreeEqualityTestCase): self.assertEqual([], node1.attributes) self.assertIs(attrs, node2.attributes) - def test_showtag(self): - """test getter/setter for the showtag attribute""" + def test_wiki_markup(self): + """test getter/setter for the wiki_markup attribute""" node = Tag(wraptext("i"), wraptext("italic text")) - self.assertTrue(node.showtag) - node.showtag = False - self.assertFalse(node.showtag) + self.assertFalse(node.wiki_markup) + node.wiki_markup = True + self.assertTrue(node.wiki_markup) self.assertEqual("''italic text''", node) - node.showtag = 1 - self.assertTrue(node.showtag) + node.wiki_markup = 0 + self.assertFalse(node.wiki_markup) self.assertEqual("italic text", node) def test_self_closing(self): diff --git a/tests/test_tokens.py b/tests/test_tokens.py index 4620982..2048bb9 100644 --- a/tests/test_tokens.py +++ b/tests/test_tokens.py @@ -44,8 +44,8 @@ class TestTokens(unittest.TestCase): self.assertEqual("bar", token2.foo) self.assertEqual(123, token2.baz) - self.assertRaises(KeyError, lambda: token1.foo) - self.assertRaises(KeyError, lambda: token2.bar) + self.assertFalse(token1.foo) + self.assertFalse(token2.bar) token1.spam = "eggs" token2.foo = "ham" @@ -53,7 +53,7 @@ class TestTokens(unittest.TestCase): self.assertEqual("eggs", token1.spam) self.assertEqual("ham", token2.foo) - self.assertRaises(KeyError, lambda: token2.baz) + self.assertFalse(token2.baz) self.assertRaises(KeyError, delattr, token2, "baz") def test_repr(self): From 4f52887e172d5e6b8020f7e3abfa91d427e99dca Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 2 Aug 2013 22:11:46 -0400 Subject: [PATCH 02/27] wiki_markup stores the markup itself instead of a bool --- mwparserfromhell/nodes/tag.py | 18 +++++++++++------- mwparserfromhell/parser/builder.py | 7 ++++--- mwparserfromhell/parser/tokens.py | 2 +- mwparserfromhell/tag_defs.py | 18 +----------------- tests/test_tag.py | 14 +++++++------- 5 files changed, 24 insertions(+), 35 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 25c0708..02b87b4 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -24,7 +24,7 @@ from __future__ import unicode_literals from . 
import Node, Text
 from ..compat import str
-from ..tag_defs import get_wiki_markup, is_visible
+from ..tag_defs import is_visible
 from ..utils import parse_anything
 
 __all__ = ["Tag"]
@@ -32,7 +32,7 @@ __all__ = ["Tag"]
 class Tag(Node):
     """Represents an HTML-style tag in wikicode, like ``<ref>``."""
 
-    def __init__(self, tag, contents=None, attrs=None, wiki_markup=False,
+    def __init__(self, tag, contents=None, attrs=None, wiki_markup=None,
                  self_closing=False, invalid=False, implicit=False,
                  padding="", closing_tag=None):
         super(Tag, self).__init__()
@@ -54,11 +54,10 @@
 
     def __unicode__(self):
         if self.wiki_markup:
-            open_, close = get_wiki_markup(self.tag)
             if self.self_closing:
-                return open_
+                return self.wiki_markup
             else:
-                return open_ + str(self.contents) + close
+                return self.wiki_markup + str(self.contents) + self.wiki_markup
 
         result = ("</" if self.invalid else "<") + str(self.tag)
         if self.attributes:
@@ -132,7 +131,12 @@
 
     @property
     def wiki_markup(self):
-        """Whether to show the wiki version of a tag instead of the HTML."""
+        """The wikified version of a tag to show instead of HTML.
+
+        If set to a value, this will be displayed instead of the brackets.
+        For example, set to ``''`` to replace ``<i>`` or ``----`` to replace
+        ``<hr>``.
+        """
         return self._wiki_markup
 
     @property
@@ -185,7 +189,7 @@
 
     @wiki_markup.setter
     def wiki_markup(self, value):
-        self._wiki_markup = bool(value)
+        self._wiki_markup = str(value) if value else None
 
     @self_closing.setter
     def self_closing(self, value):
diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py
index ef55776..196ef14 100644
--- a/mwparserfromhell/parser/builder.py
+++ b/mwparserfromhell/parser/builder.py
@@ -207,14 +207,14 @@
         """Handle a case where a tag is at the head of the tokens."""
         close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose)
         implicit, attrs, contents, closing_tag = False, [], None, None
-        wiki_markup, invalid = token.wiki_markup, token.invalid
+        wiki_markup, invalid = token.wiki_markup, token.invalid or False
         self._push()
         while self._tokens:
             token = self._tokens.pop()
             if isinstance(token, tokens.TagAttrStart):
                 attrs.append(self._handle_attribute(token))
             elif isinstance(token, tokens.TagCloseOpen):
-                padding = token.padding
+                padding = token.padding or ""
                 tag = self._pop()
                 self._push()
             elif isinstance(token, tokens.TagOpenClose):
@@ -224,7 +224,8 @@
             if isinstance(token, tokens.TagCloseSelfclose):
                 tag = self._pop()
                 self_closing = True
-                padding, implicit = token.padding, token.implicit
+                padding = token.padding or ""
+                implicit = token.implicit or False
             else:
                 self_closing = False
                 closing_tag = self._pop()
diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py
index 6dd3446..8c2ea87 100644
--- a/mwparserfromhell/parser/tokens.py
+++ b/mwparserfromhell/parser/tokens.py
@@ -55,7 +55,7 @@ class Token(object):
         return False
 
     def __getattr__(self, key):
-        return self._kwargs.get(key, False)
+        return self._kwargs.get(key)
 
     def __setattr__(self, key, value):
         self._kwargs[key] = value
diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py
index c918b4d..94e0ac4 100644
--- a/mwparserfromhell/tag_defs.py
+++ b/mwparserfromhell/tag_defs.py
@@ -24,8 +24,7 @@
 
 from __future__ import unicode_literals
 
-__all__ = ["get_wiki_markup", "is_parsable", "is_visible", "is_single",
-           "is_single_only"]
+__all__ = ["is_parsable", "is_visible", "is_single", "is_single_only"]
 
 PARSER_BLACKLIST = [
     # enwiki extensions @ 2013-06-28
@@ -44,21 +43,6 @@ INVISIBLE_TAGS = [
 SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
 SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]
 
-WIKI_MARKUP = {
-    "i": {"open": "''", "close": "''"},
-    "b": {"open": "'''", "close": "'''"},
-    "ul": {"open": "*"},
-    "ol": {"open": "#"},
-    "dt": {"open": ";"},
-    "dd": {"open": ":"},
-    "hr": {"open": "----"},
-}
-
-def get_wiki_markup(tag):
-    """Return the appropriate wiki markup before and after the given *tag*."""
-    data = WIKI_MARKUP[tag.lower()]
-    return (data.get("open"), data.get("close"))
-
 def is_parsable(tag):
     """Return if the given *tag*'s contents should be passed to the parser."""
     return tag.lower() not in PARSER_BLACKLIST
diff --git a/tests/test_tag.py b/tests/test_tag.py
index a0fbcf1..7ffce35 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -50,8 +50,8 @@ class TestTag(TreeEqualityTestCase):
                     implicit=True)
         node7 = Tag(wraptext("br"), self_closing=True, invalid=True,
                     padding=" ")
-        node8 = Tag(wraptext("hr"), wiki_markup=True, self_closing=True)
-        node9 = Tag(wraptext("i"), wraptext("italics!"), wiki_markup=True)
+        node8 = Tag(wraptext("hr"), wiki_markup="----", self_closing=True)
+        node9 = Tag(wraptext("i"), wraptext("italics!"), wiki_markup="''")
 
         self.assertEqual("",
str(node1)) self.assertEqual('foo', str(node2)) @@ -72,7 +72,7 @@ class TestTag(TreeEqualityTestCase): # foobar node1 = Tag(wrap([node1n1]), wrap([node1n2])) # '''bold text''' - node2 = Tag(wraptext("i"), wrap([node2n1]), wiki_markup=True) + node2 = Tag(wraptext("b"), wrap([node2n1]), wiki_markup="'''") # node3 = Tag(wrap([node3n1]), attrs=[Attribute(wrap([node3n2]), wrap([node3n3])), @@ -159,11 +159,11 @@ class TestTag(TreeEqualityTestCase): def test_wiki_markup(self): """test getter/setter for the wiki_markup attribute""" node = Tag(wraptext("i"), wraptext("italic text")) - self.assertFalse(node.wiki_markup) - node.wiki_markup = True - self.assertTrue(node.wiki_markup) + self.assertIs(None, node.wiki_markup) + node.wiki_markup = "''" + self.assertEqual("''", node.wiki_markup) self.assertEqual("''italic text''", node) - node.wiki_markup = 0 + node.wiki_markup = False self.assertFalse(node.wiki_markup) self.assertEqual("italic text", node) From f70188daa882a91459382ff259daca9ffa628abd Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 2 Aug 2013 22:35:04 -0400 Subject: [PATCH 03/27] Add builder and tokenizer tests for wiki-markup tags. --- tests/test_builder.py | 14 +++++++++++ tests/tokenizer/tags_wikimarkup.mwtest | 46 ++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 tests/tokenizer/tags_wikimarkup.mwtest diff --git a/tests/test_builder.py b/tests/test_builder.py index c987820..29ae65a 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -303,6 +303,20 @@ class TestBuilder(TreeEqualityTestCase): Text(" "), Wikilink(wraptext("q")), Text(" "), Template(wraptext("r"))]), True, " \n ", " ", " ")])])), + + # "''italic text''" + ([tokens.TagOpenOpen(wiki_markup="''"), tokens.Text(text="i"), + tokens.TagCloseOpen(), tokens.Text(text="italic text"), + tokens.TagOpenClose(), tokens.Text(text="i"), + tokens.TagCloseClose()], + wrap([Tag(wraptext("i"), wraptext("italic text"), + wiki_markup="''")])), + + # * bullet + ([tokens.TagOpenOpen(wiki_markup="*"), tokens.Text(text="li"), + tokens.TagCloseSelfclose(), tokens.Text(text=" bullet")], + wrap([Tag(wraptext("li"), wiki_markup="*", self_closing=True), + Text(" bullet")])), ] for test, valid in tests: self.assertWikicodeEqual(valid, self.builder.build(test)) diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest new file mode 100644 index 0000000..7dc211e --- /dev/null +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -0,0 +1,46 @@ +name: basic_italics +label: basic italic text +input: "''text''" +output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="i"), TagCloseClose()] + +--- + +name: basic_bold +label: basic bold text +input: "'''text'''" +output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose()] + +--- + +name: basic_ul +label: basic unordered list +input: "*text" +output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="text")] + +--- + +name: basic_ol +label: basic ordered list +input: "#text" +output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="text")] + +--- + +name: basic_dt +label: basic description term +input: ";text" +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="text")] + +--- + +name: basic_dd +label: basic description item +input: ":text" +output: [TagOpenOpen(wiki_markup=":"), 
Text(text="dd"), TagCloseSelfclose(), Text(text="text")] + +--- + +name: basic_hr +label: basic horizontal rule +input: "----" +output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose()] From 81bafdb72fbd35c289a734fbd2fe54e91fae803e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 7 Aug 2013 21:50:38 -0400 Subject: [PATCH 04/27] Add 56 tokenizer tests for wiki-markup tags. --- tests/tokenizer/tags_wikimarkup.mwtest | 393 +++++++++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index 7dc211e..156f325 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -44,3 +44,396 @@ name: basic_hr label: basic horizontal rule input: "----" output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose()] + +--- + +name: complex_italics +label: italics with a lot in them +input: "''this is a test of [[Italic text|italics]] with {{plenty|of|stuff}}''" +output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of "), WikilinkOpen(), Text(text="Italic text"), WikilinkSeparator(), Text(text="italics"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose(), TagOpenClose(), Text(text="i"), TagCloseClose()] + +--- + +name: multiline_italics +label: italics spanning mulitple lines +input: "foo\nbar''testing\ntext\nspanning\n\n\n\n\nmultiple\nlines''foo\n\nbar" +output: [Text(text="foo\nbar"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="testing\ntext\nspanning\n\n\n\n\nmultiple\nlines"), TagOpenClose(), Text(text="i"), TagCloseClose()] + +--- + +name: unending_italics +label: italics without an ending tag +input: "''unending formatting!" +output: [Text(text="''unending formatting!")] + +--- + +name: misleading_italics_end +label: italics with something that looks like an end but isn't +input: "''this is 'not' the en'd'''" +output: [Text(text="''this is 'not' the en'd'"), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="''"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()] +] + +--- + +name: italics_start_outside_end_inside +label: italics that start outside a link and end inside it +input: "''foo[[bar|baz'']]spam" +output: [] + +--- + +name: italics_start_inside_end_outside +label: italics that start inside a link and end outside it +input: "[[foo|''bar]]baz''spam" +output: [] + +--- + +name: complex_bold +label: bold with a lot in it +input: "'''this is a test of [[Bold text|bold]] with {{plenty|of|stuff}}'''" +output: [] + +--- + +name: multiline_bold +label: bold spanning mulitple lines +input: "foo\nbar'''testing\ntext\nspanning\n\n\n\n\nmultiple\nlines'''foo\n\nbar" +output: [] + +--- + +name: unending_bold +label: bold without an ending tag +input: "'''unending formatting!" 
+output: [Text(text="'''unending formatting!")] + +--- + +name: misleading_bold_end +label: bold with something that looks like an end but isn't +input: "'''this is 'not' the en''d'''" +output: [Text(text="'''this is 'not' the en''d'"), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="''"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()] + +--- + +name: bold_start_outside_end_inside +label: bold that start outside a link and end inside it +input: "'''foo[[bar|baz''']]spam" +output: [] + +--- + +name: bold_start_inside_end_outside +label: bold that start inside a link and end outside it +input: "[[foo|'''bar]]baz'''spam" +output: [] + +--- + +name: bold_and_italics +label: bold and italics together +input: "this is '''''bold and italic text'''''!" +output: [] + +--- + +name: both_then_bold +label: text that starts bold/italic, then is just bold +input: "'''''both''bold'''" +output: [] + +--- + +name: both_then_italics +label: text that starts bold/italic, then is just italic +input: "'''''both'''italics''" +output: [] + +--- + +name: bold_then_both +label: text that starts just bold, then is bold/italic +input: "'''bold''both'''''" +output: [] + +--- + +name: italics_then_both +label: text that starts just italic, then is bold/italic +input: "''italics'''both'''''" +output: [] + +--- + +name: seven +label: seven ticks +input: "'''''''seven'''''''" +output: [] + +--- + +name: complex_ul +label: ul with a lot in it +input: "* this is a test of an [[Unordered list|ul]] with {{plenty|of|stuff}}" +output: [] + +--- + +name: ul_multiline_template +label: ul with a template that spans multiple lines +input: "* this has a template with a {{line|\nbreak}}\nthis is not part of the list" +output: [] + +--- + +name: ul_adjacent +label: multiple adjacent uls +input: "a\n*b\n*c\nd\n*e\nf" +output: [] + +--- + +name: ul_depths +label: multiple adjacent uls, with differing depths +input: "*a\n**b\n***c\n********d\n**e\nf\n***g" +output: [] + +--- + +name: ul_space_before +label: uls with space before them +input: "foo *bar\n *baz\n*buzz" +output: [] + +--- + +name: ul_interruption +label: high-depth ul with something blocking it +input: "**f*oobar" +output: [] + +--- + +name: complex_ol +label: ol with a lot in it +input: "# this is a test of an [[Ordered list|ol]] with {{plenty|of|stuff}}" +output: [] + +--- + +name: ol_multiline_template +label: ol with a template that spans moltiple lines +input: "# this has a template with a {{line|\nbreak}}\nthis is not part of the list" +output: [] + +--- + +name: ol_adjacent +label: moltiple adjacent ols +input: "a\n#b\n#c\nd\n#e\nf" +output: [] + +--- + +name: ol_depths +label: moltiple adjacent ols, with differing depths +input: "#a\n##b\n###c\n########d\n##e\nf\n###g" +output: [] + +--- + +name: ol_space_before +label: ols with space before them +input: "foo #bar\n #baz\n#buzz" +output: [] + +--- + +name: ol_interruption +label: high-depth ol with something blocking it +input: "##f#oobar" +output: [] + +--- + +name: ul_ol_mix +label: a mix of adjacent uls and ols +input: "*a\n*#b\n*##c\n*##*#*#*d\n*#e\nf\n##*g" +output: [] + +--- + +name: complex_dt +label: dt with a lot in it +input: "; this is a test of an [[description term|dt]] with {{plenty|of|stuff}}" +output: [] + +--- + +name: dt_multiline_template +label: dt with a template that spans mdttiple lines +input: "; this has a template with a {{line|\nbreak}}\nthis is not part of the list" +output: [] + +--- + +name: dt_adjacent +label: mdttiple adjacent dts +input: 
";\n;b\n;c\nd\n;e\nf" +output: [] + +--- + +name: dt_depths +label: mdttiple adjacent dts, with differing depths +input: ";a\n;;b\n;;;c\n;;;;;;;;d\n;;e\nf\n;;;g" +output: [] + +--- + +name: dt_space_before +label: dts with space before them +input: "foo ;bar\n ;baz\n;buzz" +output: [] + +--- + +name: dt_interruption +label: high-depth dt with something blocking it +input: ";;f;oobar" +output: [] + +--- + +name: complex_dd +label: dd with a lot in it +input: ": this is a :test of an [[description item|dd]] with {{plenty|of|stuff}}" +output: [] + +--- + +name: dd_multiline_template +label: dd with a template that spans mddtiple lines +input: ": this has a template with a {{line|\nbreak}}\nthis is not part of the list" +output: [] + +--- + +name: dd_adjacent +label: mddtiple adjacent dds +input: ":\n:b\n:c\nd\n:e\nf" +output: [] + +--- + +name: dd_depths +label: mddtiple adjacent dds, with differing depths +input: ":a\n::b\n:::c\n::::::::d\n::e\nf\n:::g" +output: [] + +--- + +name: dd_space_before +label: dds with space before them +input: "foo :bar\n :baz\n:buzz" +output: [] + +--- + +name: dd_interruption +label: high-depth dd with something blocking it +input: "::f:oobar" +output: [] + +--- + +name: dt_dd_mix +label: a mix of adjacent dts and dds +input: ";a\n;:b\n;::c\n;::;:;:;d\n;:e\nf\n::;g" +output: [] + +--- + +name: dt_dd_mix2 +label: the correct usage of a dt/dd unit, as in a dl +input: ";foo:bar" +output: [] + +--- + +name: dt_dd_mix3 +label: another complex example of dts and dds +input: ";:::;foo::;:bar;;" +output: [] + +--- + +name: hr_text_before +label: text before an otherwise-valid hr +input: "foo----" +output: [Text(text="foo----")] + +--- + +name: hr_text_after +label: text after a valid hr +input: "----bar" +output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="bar")] + +--- + +name: hr_text_before_after +label: text at both ends of an otherwise-valid hr +input: "foo----bar" +output: [Text(text="foo----bar")] + +--- + +name: hr_newlines +label: newlines surrounding a valid hr +input: "foo\n----\nbar" +output: [Text(text="foo\n"), TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="\nbar")] + +--- + +name: hr_adjacent +label: two adjacent hrs +input: "----\n----" +output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="\n"), TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose()] + +--- + +name: hr_adjacent_space +label: two adjacent hrs, with a space before the second one, making it invalid +input: "----\n ----" +output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="\n ----")] + +--- + +name: hr_short +label: an invalid three-hyphen-long hr +input: "---" +output: [Text(text="---")] + +--- + +name: hr_long +label: a very long, valid hr +input: "------------------------------------------" +output: [TagOpenOpen(wiki_markup="------------------------------------------"), Text(text="hr"), TagCloseSelfclose()] + +--- + +name: hr_interruption_short +label: a hr that is interrupted, making it invalid +input: "---x-" +output: [Text(text="---x-")] + +--- + +name: hr_interruption_long +label: a hr that is interrupted, but the first part remains valid because it is long enough +input: "----x--" +output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="x--")] From 0de0a1f7951ac3615b548b6d78737af19d059e14 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 8 Aug 2013 03:37:33 -0400 Subject: [PATCH 
05/27] Finish expected results for wiki-markup test cases. --- tests/tokenizer/tags_wikimarkup.mwtest | 84 +++++++++++++++++----------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index 156f325..e1891f5 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -79,28 +79,28 @@ output: [Text(text="''this is 'not' the en'd'"), TagOpenOpen(), Text(text="nowik name: italics_start_outside_end_inside label: italics that start outside a link and end inside it input: "''foo[[bar|baz'']]spam" -output: [] +output: [Text(text="''foo"), WikilinkOpen(), Text(text="bar"), WikilinkSeparator(), Text(text="baz''"), WikilinkClose(), Text(text="spam")] --- name: italics_start_inside_end_outside label: italics that start inside a link and end outside it input: "[[foo|''bar]]baz''spam" -output: [] +output: [Text(text="[[foo|"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar]]baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="spam")] --- name: complex_bold label: bold with a lot in it input: "'''this is a test of [[Bold text|bold]] with {{plenty|of|stuff}}'''" -output: [] +output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of "), WikilinkOpen(), Text(text="Bold text"), WikilinkSeparator(), Text(text="bold"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose(), TagOpenClose(), Text(text="b"), TagCloseClose()] --- name: multiline_bold label: bold spanning mulitple lines input: "foo\nbar'''testing\ntext\nspanning\n\n\n\n\nmultiple\nlines'''foo\n\nbar" -output: [] +output: [Text(text="foo\nbar"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="testing\ntext\nspanning\n\n\n\n\nmultiple\nlines"), TagOpenClose(), Text(text="b"), TagCloseClose()] --- @@ -113,260 +113,260 @@ output: [Text(text="'''unending formatting!")] name: misleading_bold_end label: bold with something that looks like an end but isn't -input: "'''this is 'not' the en''d'''" -output: [Text(text="'''this is 'not' the en''d'"), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="''"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()] +input: "'''this is 'not' the en''d''''" +output: [Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="this is 'not' the en"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="d'"), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="'''"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()] --- name: bold_start_outside_end_inside label: bold that start outside a link and end inside it input: "'''foo[[bar|baz''']]spam" -output: [] +output: [Text(text="'''foo"), WikilinkOpen(), Text(text="bar"), WikilinkSeparator(), Text(text="baz'''"), WikilinkClose(), Text(text="spam")] --- name: bold_start_inside_end_outside label: bold that start inside a link and end outside it input: "[[foo|'''bar]]baz'''spam" -output: [] +output: [Text(text="[[foo|"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bar]]baz"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="spam")] --- name: bold_and_italics label: bold and italics together input: "this is 
'''''bold and italic text'''''!" -output: [] +output: [Text(text="this is "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold and italic text"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="!")] --- name: both_then_bold label: text that starts bold/italic, then is just bold input: "'''''both''bold'''" -output: [] +output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose()] --- name: both_then_italics label: text that starts bold/italic, then is just italic input: "'''''both'''italics''" -output: [] +output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose()] --- name: bold_then_both label: text that starts just bold, then is bold/italic input: "'''bold''both'''''" -output: [] +output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenClose(), Text(text="b"), TagCloseClose()] --- name: italics_then_both label: text that starts just italic, then is bold/italic input: "''italics'''both'''''" -output: [] +output: [TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose()] --- name: seven label: seven ticks input: "'''''''seven'''''''" -output: [] +output: [Text(text="''"), TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="seven''"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose()] --- name: complex_ul label: ul with a lot in it input: "* this is a test of an [[Unordered list|ul]] with {{plenty|of|stuff}}" -output: [] +output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="Unordered list"), WikilinkSeparator(), Text(text="ul"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()] --- name: ul_multiline_template label: ul with a template that spans multiple lines input: "* this has a template with a {{line|\nbreak}}\nthis is not part of the list" -output: [] +output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")] --- name: ul_adjacent label: multiple adjacent uls input: "a\n*b\n*c\nd\n*e\nf" -output: [] +output: [Text(text="a\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), 
TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf")] --- name: ul_depths label: multiple adjacent uls, with differing depths input: "*a\n**b\n***c\n********d\n**e\nf\n***g" -output: [] +output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="g")] --- name: ul_space_before label: uls with space before them input: "foo *bar\n *baz\n*buzz" -output: [] +output: [Text(text="foo *bar\n *baz\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="buzz")] --- name: ul_interruption label: high-depth ul with something blocking it input: "**f*oobar" -output: [] +output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="f*oobar")] --- name: complex_ol label: ol with a lot in it input: "# this is a test of an [[Ordered list|ol]] with {{plenty|of|stuff}}" -output: [] +output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="Ordered list"), WikilinkSeparator(), Text(text="ol"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()] --- name: ol_multiline_template label: ol with a template that spans moltiple lines input: "# this has a template with a {{line|\nbreak}}\nthis is not part of the list" -output: [] +output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")] --- name: ol_adjacent label: moltiple adjacent ols input: "a\n#b\n#c\nd\n#e\nf" -output: [] +output: [Text(text="a\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="c\nd\n"), 
TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf")] --- name: ol_depths label: moltiple adjacent ols, with differing depths input: "#a\n##b\n###c\n########d\n##e\nf\n###g" -output: [] +output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="g")] --- name: ol_space_before label: ols with space before them input: "foo #bar\n #baz\n#buzz" -output: [] +output: [Text(text="foo #bar\n #baz\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="buzz")] --- name: ol_interruption label: high-depth ol with something blocking it input: "##f#oobar" -output: [] +output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="f#oobar")] --- name: ul_ol_mix label: a mix of adjacent uls and ols input: "*a\n*#b\n*##c\n*##*#*#*d\n*#e\nf\n##*g" -output: [] +output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), 
Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="g")] --- name: complex_dt label: dt with a lot in it input: "; this is a test of an [[description term|dt]] with {{plenty|of|stuff}}" -output: [] +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description term"), WikilinkSeparator(), Text(text="dt"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()] --- name: dt_multiline_template label: dt with a template that spans mdttiple lines input: "; this has a template with a {{line|\nbreak}}\nthis is not part of the list" -output: [] +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")] --- name: dt_adjacent label: mdttiple adjacent dts input: ";\n;b\n;c\nd\n;e\nf" -output: [] +output: [Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf")] --- name: dt_depths label: mdttiple adjacent dts, with differing depths input: ";a\n;;b\n;;;c\n;;;;;;;;d\n;;e\nf\n;;;g" -output: [] +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="g")] --- name: dt_space_before label: dts with space before them input: "foo ;bar\n ;baz\n;buzz" -output: [] +output: [Text(text="foo ;bar\n ;baz\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="buzz")] --- name: dt_interruption label: high-depth dt with something blocking it input: ";;f;oobar" -output: [] +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), 
TagCloseSelfclose(), Text(text="f;oobar")] --- name: complex_dd label: dd with a lot in it input: ": this is a :test of an [[description item|dd]] with {{plenty|of|stuff}}" -output: [] +output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description item"), WikilinkSeparator(), Text(text="dd"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()] --- name: dd_multiline_template label: dd with a template that spans mddtiple lines input: ": this has a template with a {{line|\nbreak}}\nthis is not part of the list" -output: [] +output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")] --- name: dd_adjacent label: mddtiple adjacent dds input: ":\n:b\n:c\nd\n:e\nf" -output: [] +output: [Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf")] --- name: dd_depths label: mddtiple adjacent dds, with differing depths input: ":a\n::b\n:::c\n::::::::d\n::e\nf\n:::g" -output: [] +output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="g")] --- name: dd_space_before label: dds with space before them input: "foo :bar\n :baz\n:buzz" -output: [] +output: [Text(text="foo :bar\n :baz\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="buzz")] --- name: dd_interruption label: high-depth dd with something blocking it input: "::f:oobar" -output: [] +output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="f:oobar")] --- name: dt_dd_mix label: a mix of 
adjacent dts and dds input: ";a\n;:b\n;::c\n;::;:;:;d\n;:e\nf\n::;g" -output: [] +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="g")] --- name: dt_dd_mix2 label: the correct usage of a dt/dd unit, as in a dl input: ";foo:bar" -output: [] +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")] --- name: dt_dd_mix3 label: another complex example of dts and dds input: ";:::;foo::;:bar;;" -output: [] +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=";:bar;;")] --- From c910951273f3958ee94345345f70eeab71c0ec69 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 8 Aug 2013 16:48:51 -0400 Subject: [PATCH 06/27] Implement horizontal rules (----) --- mwparserfromhell/parser/tokenizer.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 50c7fbd..7754a6b 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -629,6 +629,17 @@ class Tokenizer(object): else: self._emit_all(tag) + def _parse_hr(self): + """Parse a wiki-style horizontal rule (``----``) at the string head.""" + length = 4 + self._head += 3 + while self._read(1) == "-": + length += 1 + self._head += 1 + self._emit(tokens.TagOpenOpen(wiki_markup="-" * length)) + self._emit_text("hr") + self._emit(tokens.TagCloseSelfclose()) + def _handle_end(self): """Handle the end of the stream of wikitext.""" fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | @@ -782,6 +793,11 @@ class Tokenizer(object): self._emit_text("<") elif this == ">" and self._context & contexts.TAG_CLOSE: 
return self._handle_tag_close_close() + elif this == next == "-" and self._read(-1) in ("\n", self.START): + if self._read(2) == self._read(3) == "-": + self._parse_hr() + else: + self._emit_text("-") else: self._emit_text(this) self._head += 1 From 7bce2f4e96da43e71fb1fc89f1cc5645ed32fce2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 8 Aug 2013 21:27:23 -0400 Subject: [PATCH 07/27] Parse *, #; add another test. --- mwparserfromhell/parser/tokenizer.py | 18 ++++++++++++++++-- tests/tokenizer/tags_wikimarkup.mwtest | 7 +++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 7754a6b..ef45ee9 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -629,6 +629,18 @@ class Tokenizer(object): else: self._emit_all(tag) + def _parse_list(self): + """Parse a wiki-style list (``#``, ``*``, ``;``, ``:``).""" + def emit(): + self._emit(tokens.TagOpenOpen(wiki_markup=self._read())) + self._emit_text("li") + self._emit(tokens.TagCloseSelfclose()) + + emit() + while self._read(1) in ("#", "*"): + self._head += 1 + emit() + def _parse_hr(self): """Parse a wiki-style horizontal rule (``----``) at the string head.""" length = 4 @@ -793,8 +805,10 @@ class Tokenizer(object): self._emit_text("<") elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() - elif this == next == "-" and self._read(-1) in ("\n", self.START): - if self._read(2) == self._read(3) == "-": + elif self._read(-1) in ("\n", self.START): + if this in ("#", "*"): + self._parse_list() + elif this == next == self._read(2) == self._read(3) == "-": self._parse_hr() else: self._emit_text("-") diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index e1891f5..9ce71b6 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -370,6 +370,13 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tag --- +name: ul_ol_dt_dd_mix +label: an assortment of uls, ols, dds, and dts +input: ";:#*foo\n:#*;foo\n#*;:foo\n*;:#foo" +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="foo\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="foo\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="foo")] + +--- + name: hr_text_before label: text before an otherwise-valid hr input: "foo----" From 3288b80c15bc9f438dd48c355bb1a1b85114d0f3 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 8 Aug 
2013 21:28:42 -0400 Subject: [PATCH 08/27] Fix. --- mwparserfromhell/parser/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index ef45ee9..f167db4 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -811,7 +811,7 @@ class Tokenizer(object): elif this == next == self._read(2) == self._read(3) == "-": self._parse_hr() else: - self._emit_text("-") + self._emit_text(self._read()) else: self._emit_text(this) self._head += 1 From dd2a6f913b140fb9a1b81cfa7dbc41e5f5050b1c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 9 Aug 2013 20:42:19 -0400 Subject: [PATCH 09/27] Add support for dt, dd. Refactor. Fix some broken tests. --- mwparserfromhell/parser/contexts.py | 16 +++++++----- mwparserfromhell/parser/tokenizer.py | 46 ++++++++++++++++++++++------------ mwparserfromhell/tag_defs.py | 14 ++++++++++- tests/tokenizer/tags_wikimarkup.mwtest | 16 ++++++------ 4 files changed, 61 insertions(+), 31 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 211136c..2785708 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -69,6 +69,8 @@ Local (stack-specific) contexts: * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` +* :py:const:`DL_TERM` + * :py:const:`SAFETY_CHECK` * :py:const:`HAS_TEXT` @@ -115,12 +117,14 @@ TAG_BODY = 1 << 16 TAG_CLOSE = 1 << 17 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE -HAS_TEXT = 1 << 18 -FAIL_ON_TEXT = 1 << 19 -FAIL_NEXT = 1 << 20 -FAIL_ON_LBRACE = 1 << 21 -FAIL_ON_RBRACE = 1 << 22 -FAIL_ON_EQUALS = 1 << 23 +DL_TERM = 1 << 18 + +HAS_TEXT = 1 << 19 +FAIL_ON_TEXT = 1 << 20 +FAIL_NEXT = 1 << 21 +FAIL_ON_LBRACE = 1 << 22 +FAIL_ON_RBRACE = 1 << 23 +FAIL_ON_EQUALS = 1 << 24 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index f167db4..d3ce7bd 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -26,7 +26,7 @@ import re from . 
import contexts, tokens from ..compat import htmlentities -from ..tag_defs import is_parsable, is_single, is_single_only +from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only __all__ = ["Tokenizer"] @@ -629,20 +629,24 @@ class Tokenizer(object): else: self._emit_all(tag) - def _parse_list(self): - """Parse a wiki-style list (``#``, ``*``, ``;``, ``:``).""" - def emit(): - self._emit(tokens.TagOpenOpen(wiki_markup=self._read())) - self._emit_text("li") - self._emit(tokens.TagCloseSelfclose()) + def _handle_list_marker(self): + """Handle a list marker at the head (``#``, ``*``, ``;``, ``:``).""" + markup = self._read() + if markup == ";": + self._context |= contexts.DL_TERM + self._emit(tokens.TagOpenOpen(wiki_markup=markup)) + self._emit_text(get_html_tag(markup)) + self._emit(tokens.TagCloseSelfclose()) - emit() - while self._read(1) in ("#", "*"): + def _handle_list(self): + """Handle a wiki-style list (``#``, ``*``, ``;``, ``:``).""" + self._handle_list_marker() + while self._read(1) in ("#", "*", ";", ":"): self._head += 1 - emit() + self._handle_list_marker() - def _parse_hr(self): - """Parse a wiki-style horizontal rule (``----``) at the string head.""" + def _handle_hr(self): + """Handle a wiki-style horizontal rule (``----``) in the string.""" length = 4 self._head += 3 while self._read(1) == "-": @@ -652,6 +656,14 @@ class Tokenizer(object): self._emit_text("hr") self._emit(tokens.TagCloseSelfclose()) + def _handle_dl_term(self): + """Handle the term in a description list (``foo`` in ``;foo:bar``).""" + self._context ^= contexts.DL_TERM + if self._read() == ":": + self._handle_list_marker() + else: + self._emit_text("\n") + def _handle_end(self): """Handle the end of the stream of wikitext.""" fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | @@ -806,12 +818,14 @@ class Tokenizer(object): elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() elif self._read(-1) in ("\n", self.START): - if this in ("#", "*"): - self._parse_list() + if this in ("#", "*", ";", ":"): + self._handle_list() elif this == next == self._read(2) == self._read(3) == "-": - self._parse_hr() + self._handle_hr() else: - self._emit_text(self._read()) + self._emit_text(this) + elif this in ("\n", ":") and self._context & contexts.DL_TERM: + self._handle_dl_term() else: self._emit_text(this) self._head += 1 diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py index 94e0ac4..2395fc6 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/tag_defs.py @@ -24,7 +24,8 @@ from __future__ import unicode_literals -__all__ = ["is_parsable", "is_visible", "is_single", "is_single_only"] +__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", + "is_single_only"] PARSER_BLACKLIST = [ # enwiki extensions @ 2013-06-28 @@ -43,6 +44,17 @@ INVISIBLE_TAGS = [ SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] +MARKUP_TO_HTML = { + "#": "li", + "*": "li", + ";": "dt", + ":": "dd" +} + +def get_html_tag(markup): + """Return the HTML tag associated with the given wiki-markup.""" + return MARKUP_TO_HTML[markup] + def is_parsable(tag): """Return if the given *tag*'s contents should be passed to the parser.""" return tag.lower() not in PARSER_BLACKLIST diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index 9ce71b6..c9664fb 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -281,7 
+281,7 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tex name: dt_adjacent label: multiple adjacent dts -input: ";\n;b\n;c\nd\n;e\nf" +input: "a\n;b\n;c\nd\n;e\nf" output: [Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf")] --- @@ -309,7 +309,7 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tag name: complex_dd label: dd with a lot in it -input: ": this is a&nbsp:test of an [[description item|dd]] with {{plenty|of|stuff}}" +input: ": this is a&nbsp;test of an [[description item|dd]] with {{plenty|of|stuff}}" output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description item"), WikilinkSeparator(), Text(text="dd"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()] --- @@ -323,7 +323,7 @@ output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Tex name: dd_adjacent label: multiple adjacent dds -input: ":\n:b\n:c\nd\n:e\nf" +input: "a\n:b\n:c\nd\n:e\nf" output: [Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf")] --- @@ -358,15 +358,15 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tex name: dt_dd_mix2 label: the correct usage of a dt/dd unit, as in a dl -input: ";foo:bar" -output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")] +input: ";foo:bar:baz" +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")] --- name: dt_dd_mix3 -label: another complex example of dts and dds -input: ";:::;foo::;:bar;;" -output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=";:bar;;")] +label: another example of correct (but strange) dt/dd usage +input: ":;;::foo:bar:baz" +output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")] --- From 
851a9e586363cdf8612098afaad00915c2ef26b0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 10 Aug 2013 00:21:29 -0400 Subject: [PATCH 10/27] Add a couple tests and fix a couple broken ones. --- tests/tokenizer/tags_wikimarkup.mwtest | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index c9664fb..6644cd0 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -57,7 +57,7 @@ output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(tex name: multiline_italics label: italics spanning multiple lines input: "foo\nbar''testing\ntext\nspanning\n\n\n\n\nmultiple\nlines''foo\n\nbar" -output: [Text(text="foo\nbar"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="testing\ntext\nspanning\n\n\n\n\nmultiple\nlines"), TagOpenClose(), Text(text="i"), TagCloseClose()] +output: [Text(text="foo\nbar"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="testing\ntext\nspanning\n\n\n\n\nmultiple\nlines"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="foo\n\nbar")] --- @@ -100,7 +100,7 @@ output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(te name: multiline_bold label: bold spanning multiple lines input: "foo\nbar'''testing\ntext\nspanning\n\n\n\n\nmultiple\nlines'''foo\n\nbar" -output: [Text(text="foo\nbar"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="testing\ntext\nspanning\n\n\n\n\nmultiple\nlines"), TagOpenClose(), Text(text="b"), TagCloseClose()] +output: [Text(text="foo\nbar"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="testing\ntext\nspanning\n\n\n\n\nmultiple\nlines"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="foo\n\nbar")] --- @@ -167,6 +167,20 @@ output: [TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text --- +name: four +label: four ticks +input: "foo ''''bar'''' baz" +output: [Text(text="foo '"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bar'"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text=" baz")] + +--- + +name: four_two +label: four ticks to open, two to end +input: "foo ''''bar'' baz" +output: [Text(text="foo ''"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text=" baz")] + +--- + name: seven label: seven ticks input: "'''''''seven'''''''" From 2a82a57b2d39757d360483ba22d089ad7d149deb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 10 Aug 2013 00:21:53 -0400 Subject: [PATCH 11/27] Add support for bold and italic text (part one). 
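In outline, the tokenizer counts the run of ticks at the head, normalizes the count to 2 (italics), 3 (bold), or 5 (both), and then recursively parses the body under a STYLE_* context. A rough sketch of just the normalization rules, matching the diff below (normalize_ticks is a made-up helper name for illustration; the real logic is inline in _parse_style()):

    def normalize_ticks(ticks):
        # More than five ticks: the extras are emitted as literal text.
        literal = ""
        if ticks > 5:
            literal = "'" * (ticks - 5)
            ticks = 5
        # Exactly four: one literal tick, then the rest is treated as bold.
        elif ticks == 4:
            literal = "'"
            ticks = 3
        return literal, ticks  # ticks is now 2, 3, or 5
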
--- mwparserfromhell/parser/contexts.py | 23 ++++++++---- mwparserfromhell/parser/tokenizer.py | 69 +++++++++++++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 11 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 2785708..2ae3cc3 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -69,6 +69,11 @@ Local (stack-specific) contexts: * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` +* :py:const:`STYLE` + + * :py:const:`STYLE_ITALICS` + * :py:const:`STYLE_BOLD` + * :py:const:`DL_TERM` * :py:const:`SAFETY_CHECK` @@ -117,14 +122,18 @@ TAG_BODY = 1 << 16 TAG_CLOSE = 1 << 17 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE -DL_TERM = 1 << 18 +STYLE_ITALICS = 1 << 18 +STYLE_BOLD = 1 << 19 +STYLE = STYLE_ITALICS + STYLE_BOLD + +DL_TERM = 1 << 20 -HAS_TEXT = 1 << 19 -FAIL_ON_TEXT = 1 << 20 -FAIL_NEXT = 1 << 21 -FAIL_ON_LBRACE = 1 << 22 -FAIL_ON_RBRACE = 1 << 23 -FAIL_ON_EQUALS = 1 << 24 +HAS_TEXT = 1 << 21 +FAIL_ON_TEXT = 1 << 22 +FAIL_NEXT = 1 << 23 +FAIL_ON_LBRACE = 1 << 24 +FAIL_ON_RBRACE = 1 << 25 +FAIL_ON_EQUALS = 1 << 26 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index d3ce7bd..650e605 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -57,11 +57,11 @@ class Tokenizer(object): USES_C = False START = object() END = object() - MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", - "/", "-", "\n", END] + MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", + ":", "/", "-", "\n", END] MAX_DEPTH = 40 MAX_CYCLES = 100000 - regex = re.compile(r"([{}\[\]<>|=&#*;:/\\\"\-!\n])", flags=re.IGNORECASE) + regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) tag_splitter = re.compile(r"([\s\"\\]+)") def __init__(self): @@ -629,6 +629,58 @@ class Tokenizer(object): else: self._emit_all(tag) + def _really_parse_style(self, context): + """Parse wiki-style bold or italics. 
Raises :py:exc:`BadRoute`.""" + stack = self._parse(context) + markup = "''" if context == contexts.STYLE_ITALICS else "'''" + tag = "i" if context == contexts.STYLE_ITALICS else "b" + + self._emit(tokens.TagOpenOpen(wiki_markup=markup)) + self._emit_text(tag) + self._emit(tokens.TagCloseOpen()) + self._emit_all(stack) + self._emit(tokens.TagOpenClose()) + self._emit_text(tag) + self._emit(tokens.TagCloseClose()) + + def _parse_style(self): + """Parse wiki-style formatting (``''``/``'''`` for italics/bold).""" + self._head += 2 + ticks = 2 + while self._read() == "'": + self._head += 1 + ticks += 1 + reset = self._head + + if ticks > 5: + self._emit_text("'" * (ticks - 5)) + ticks = 5 + elif ticks == 4: + self._emit_text("'") + ticks = 3 + + if ticks == 5: + raise NotImplementedError() + if ticks == 3: + try: + return self._really_parse_style(contexts.STYLE_BOLD) + except BadRoute: + self._emit_text("'") + self._head = reset + try: + self._really_parse_style(contexts.STYLE_ITALICS) + except BadRoute: + self._emit_text("''") + self._head = reset - 1 + + def _handle_style_end(self): + """Handle the end of wiki-style italics or bold (``''`` or ``'''``).""" + self._head += 1 if self._context & contexts.STYLE_ITALICS else 2 + while self._read(1) == "'": + self._emit_text("'") + self._head += 1 + return self._pop() + def _handle_list_marker(self): """Handle a list marker at the head (``#``, ``*``, ``;``, ``:``).""" markup = self._read() @@ -667,7 +719,8 @@ class Tokenizer(object): def _handle_end(self): """Handle the end of the stream of wikitext.""" fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | - contexts.HEADING | contexts.COMMENT | contexts.TAG) + contexts.HEADING | contexts.COMMENT | contexts.TAG | + contexts.STYLE) double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) if self._context & fail: if self._context & contexts.TAG_BODY: @@ -817,6 +870,14 @@ class Tokenizer(object): self._emit_text("<") elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() + elif this == next == "'": + if not self._context & contexts.STYLE and self._can_recurse(): + self._parse_style() + elif (self._context & contexts.STYLE_ITALICS or + self._read(2) == "'" and self._context & contexts.STYLE_BOLD): + return self._handle_style_end() + else: + self._emit_text("'") elif self._read(-1) in ("\n", self.START): if this in ("#", "*", ";", ":"): self._handle_list() From 28fa6a2037841c756ab8342edf7754f01772397e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 11 Aug 2013 00:51:13 -0400 Subject: [PATCH 12/27] Add some tests and a new context. 
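STYLE_PASS_2 reserves a bit for retrying an italics parse when the first attempt fails partway (the implementation that uses it lands later in this series). Because every context constant is a distinct bit, combined contexts compose and test with bitwise operators; a minimal sketch, assuming the constants as renumbered below:

    from mwparserfromhell.parser import contexts

    ctx = contexts.STYLE_ITALICS | contexts.STYLE_PASS_2
    assert ctx & contexts.STYLE             # both bits lie inside the STYLE mask
    assert not (ctx & contexts.STYLE_BOLD)  # the bold bit stays clear
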
--- mwparserfromhell/parser/contexts.py | 18 +++++++------ tests/tokenizer/tags_wikimarkup.mwtest | 48 +++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 2ae3cc3..a7e70fa 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -73,6 +73,7 @@ Local (stack-specific) contexts: * :py:const:`STYLE_ITALICS` * :py:const:`STYLE_BOLD` + * :py:const:`STYLE_PASS_2` * :py:const:`DL_TERM` @@ -124,16 +125,17 @@ TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE STYLE_ITALICS = 1 << 18 STYLE_BOLD = 1 << 19 -STYLE = STYLE_ITALICS + STYLE_BOLD +STYLE_PASS_2 = 1 << 20 +STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_2 -DL_TERM = 1 << 20 +DL_TERM = 1 << 21 -HAS_TEXT = 1 << 21 -FAIL_ON_TEXT = 1 << 22 -FAIL_NEXT = 1 << 23 -FAIL_ON_LBRACE = 1 << 24 -FAIL_ON_RBRACE = 1 << 25 -FAIL_ON_EQUALS = 1 << 26 +HAS_TEXT = 1 << 22 +FAIL_ON_TEXT = 1 << 23 +FAIL_NEXT = 1 << 24 +FAIL_ON_LBRACE = 1 << 25 +FAIL_ON_RBRACE = 1 << 26 +FAIL_ON_EQUALS = 1 << 27 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index 6644cd0..da08a34 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -163,7 +163,21 @@ output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(te name: italics_then_both label: text that starts just italic, then is bold/italic input: "''italics'''both'''''" -output: [TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose()] + +--- + +name: italics_then_bold +label: text that starts italic, then is bold +input: "none''italics'''''bold'''none" +output: [Text(text="none"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose() TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="none")] + +--- + +name: bold_then_italics +label: text that starts bold, then is italic +input: "none'''bold'''''italics''none" +output: [Text(text="none"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose() TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="none")] --- @@ -175,16 +189,44 @@ output: [Text(text="foo '"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), Tag --- name: four_two -label: four ticks to open, two to end +label: four ticks to open, two to close input: "foo ''''bar'' baz" output: [Text(text="foo ''"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text=" baz")] --- +name: two_three +label: two ticks to open, three to close +input: "foo ''bar''' baz" +output: 
[Text(text="foo "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar'"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text=" baz")] + +--- + +name: two_four +label: two ticks to open, four to close +input: "foo ''bar'''' baz" +output: [Text(text="foo "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar''"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text=" baz")] + +--- + +name: two_three_two +label: two ticks to open, three to close, two afterwards +input: "foo ''bar''' baz''" +output: [Text(text="foo "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar''' baz"), TagOpenClose(), Text(text="i"), TagCloseClose()] + +--- + +name: two_four_four +label: two ticks to open, four to close, four afterwards +input: "foo ''bar'''' baz''''" +output: [Text(text="foo bar'"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), " baz'", TagOpenClose(), Text(text="b"), TagCloseClose()] + +--- + name: seven label: seven ticks input: "'''''''seven'''''''" -output: [Text(text="''"), TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="seven''"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose()] +output: [Text(text="''"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="seven''"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose()] --- From 55202aadb0da350d02277a31ed47b08efd28b76f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 13 Aug 2013 14:24:10 -0400 Subject: [PATCH 13/27] Fix some broken tests. 
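The breakage was syntactic: italics_then_bold and bold_then_italics were missing a comma between TagCloseClose() and the following TagOpenOpen(...), and two_four_four had dropped the leading ticks from its first Text() and left a bare string where a Text() token belonged. For reference, every case in the .mwtest files uses the same four-field layout; a minimal, hypothetical example (not an actual case in the suite):

    name: simple_bold
    label: a bare bold segment
    input: "'''foo'''"
    output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="foo"), TagOpenClose(), Text(text="b"), TagCloseClose()]
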
--- tests/tokenizer/tags_wikimarkup.mwtest | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index da08a34..d75968d 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -170,14 +170,14 @@ output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(tex name: italics_then_bold label: text that starts italic, then is bold input: "none''italics'''''bold'''none" -output: [Text(text="none"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose() TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="none")] +output: [Text(text="none"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="none")] --- name: bold_then_italics label: text that starts bold, then is italic input: "none'''bold'''''italics''none" -output: [Text(text="none"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose() TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="none")] +output: [Text(text="none"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="none")] --- @@ -219,7 +219,7 @@ output: [Text(text="foo "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCl name: two_four_four label: two ticks to open, four to close, four afterwards input: "foo ''bar'''' baz''''" -output: [Text(text="foo bar'"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), " baz'", TagOpenClose(), Text(text="b"), TagCloseClose()] +output: [Text(text="foo ''bar'"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text=" baz'"), TagOpenClose(), Text(text="b"), TagCloseClose()] --- From d6446d5d90fbbcca61e4226ccd0bdeab615bc1eb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 13 Aug 2013 15:56:30 -0400 Subject: [PATCH 14/27] Finish test coverage for tags. 
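The new five_three and five_two cases pin down how an unbalanced ''''' opener degrades: whichever closer actually appears decides the tag, and the unmatched leading ticks fall out as literal text. At the API level this behaves roughly as follows (illustrative; parse() is the package's standard entry point):

    import mwparserfromhell

    code = mwparserfromhell.parse("'''''foobar'''")
    # Tokenizes as literal "''" followed by a <b> tag around "foobar";
    # str(code) round-trips to the original markup.
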
--- tests/tokenizer/tags_wikimarkup.mwtest | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index d75968d..632ba72 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -181,6 +181,20 @@ output: [Text(text="none"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagC --- +name: five_three +label: five ticks to open, three to close (bold) +input: "'''''foobar'''" +output: [Text(text="''"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="foobar"), TagOpenClose(), Text(text="b"), TagCloseClose()] + +--- + +name: five_two +label: five ticks to open, two to close (bold) +input: "'''''foobar''" +output: [Text(text="'''"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="foobar"), TagOpenClose(), Text(text="i"), TagCloseClose()] + +--- + name: four label: four ticks input: "foo ''''bar'''' baz" From 992e7018ae6379f1b3f64fde4e23ab73eb0da7c2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 13 Aug 2013 15:58:55 -0400 Subject: [PATCH 15/27] Working bold/italics implementation (ugly, will clean up) --- mwparserfromhell/parser/tokenizer.py | 155 ++++++++++++++++++++++++++++------- 1 file changed, 126 insertions(+), 29 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 650e605..bbeefd6 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -632,8 +632,8 @@ class Tokenizer(object): def _really_parse_style(self, context): """Parse wiki-style bold or italics. Raises :py:exc:`BadRoute`.""" stack = self._parse(context) - markup = "''" if context == contexts.STYLE_ITALICS else "'''" - tag = "i" if context == contexts.STYLE_ITALICS else "b" + markup = "''" if context & contexts.STYLE_ITALICS else "'''" + tag = "i" if context & contexts.STYLE_ITALICS else "b" self._emit(tokens.TagOpenOpen(wiki_markup=markup)) self._emit_text(tag) @@ -659,27 +659,128 @@ class Tokenizer(object): self._emit_text("'") ticks = 3 - if ticks == 5: - raise NotImplementedError() - if ticks == 3: - try: - return self._really_parse_style(contexts.STYLE_BOLD) - except BadRoute: + if ticks == 2: + if self._context & contexts.STYLE_ITALICS: + return self._pop() + if self._can_recurse(): + try: + self._really_parse_style(contexts.STYLE_ITALICS) + except BadRoute: + self._head = reset + try: ## only if STYLE_PASS_AGAIN in destroyed context + self._really_parse_style(contexts.STYLE_ITALICS|contexts.STYLE_PASS_2) + except BadRoute: + self._head = reset + self._emit_text("''") + else: + self._emit_text("''") + elif ticks == 3: + if self._context & contexts.STYLE_BOLD: + return self._pop() + elif self._can_recurse(): + try: + self._really_parse_style(contexts.STYLE_BOLD) + except BadRoute: + self._head = reset + if self._context & contexts.STYLE_ITALICS: + if self._context & contexts.STYLE_PASS_2: + self._emit_text("'") + return self._pop() + self._emit_text("'''") ## here is our hook for STYLE_PASS_AGAIN + else: + self._emit_text("'") + try: + self._really_parse_style(contexts.STYLE_ITALICS) + except BadRoute: + self._head = reset + try: ## only if STYLE_PASS_AGAIN in destroyed context + self._really_parse_style(contexts.STYLE_ITALICS|contexts.STYLE_PASS_2) + except BadRoute: + self._head = reset + self._emit_text("''") + elif self._context & contexts.STYLE_ITALICS and self._context & contexts.STYLE_PASS_2: self._emit_text("'") - self._head = reset - 
try: - self._really_parse_style(contexts.STYLE_ITALICS) - except BadRoute: - self._emit_text("''") - self._head = reset - 1 - - def _handle_style_end(self): - """Handle the end of wiki-style italics or bold (``''`` or ``'''``).""" - self._head += 1 if self._context & contexts.STYLE_ITALICS else 2 - while self._read(1) == "'": - self._emit_text("'") - self._head += 1 - return self._pop() + return self._pop() + else: ## here is our hook for STYLE_PASS_AGAIN + self._emit_text("'''") + elif ticks == 5: + if self._context & contexts.STYLE_ITALICS: + self._head -= 3 + return self._pop() + elif self._context & contexts.STYLE_BOLD: + self._head -= 2 + return self._pop() + elif self._can_recurse(): + try: + stack = self._parse(contexts.STYLE_BOLD) + except BadRoute: + self._head = reset + try: + stack = self._parse(contexts.STYLE_ITALICS) + except BadRoute: + self._head = reset + self._emit_text("'''''") + else: + reset = self._head + try: + stack2 = self._parse(contexts.STYLE_BOLD) + except BadRoute: + self._head = reset + self._emit_text("'''") + self._emit(tokens.TagOpenOpen(wiki_markup="''")) + self._emit_text("i") + self._emit(tokens.TagCloseOpen()) + self._emit_all(stack) + self._emit(tokens.TagOpenClose()) + self._emit_text("i") + self._emit(tokens.TagCloseClose()) + else: + self._emit(tokens.TagOpenOpen(wiki_markup="'''")) + self._emit_text("b") + self._emit(tokens.TagCloseOpen()) + self._emit(tokens.TagOpenOpen(wiki_markup="''")) + self._emit_text("i") + self._emit(tokens.TagCloseOpen()) + self._emit_all(stack) + self._emit(tokens.TagOpenClose()) + self._emit_text("i") + self._emit(tokens.TagCloseClose()) + self._emit_all(stack2) + self._emit(tokens.TagOpenClose()) + self._emit_text("b") + self._emit(tokens.TagCloseClose()) + else: + reset = self._head + try: + stack2 = self._parse(contexts.STYLE_ITALICS) + except BadRoute: + self._head = reset + self._emit_text("''") + self._emit(tokens.TagOpenOpen(wiki_markup="'''")) + self._emit_text("b") + self._emit(tokens.TagCloseOpen()) + self._emit_all(stack) + self._emit(tokens.TagOpenClose()) + self._emit_text("b") + self._emit(tokens.TagCloseClose()) + else: + self._emit(tokens.TagOpenOpen(wiki_markup="''")) + self._emit_text("i") + self._emit(tokens.TagCloseOpen()) + self._emit(tokens.TagOpenOpen(wiki_markup="'''")) + self._emit_text("b") + self._emit(tokens.TagCloseOpen()) + self._emit_all(stack) + self._emit(tokens.TagOpenClose()) + self._emit_text("b") + self._emit(tokens.TagCloseClose()) + self._emit_all(stack2) + self._emit(tokens.TagOpenClose()) + self._emit_text("i") + self._emit(tokens.TagCloseClose()) + else: + self._emit_text("'''''") + self._head -= 1 def _handle_list_marker(self): """Handle a list marker at the head (``#``, ``*``, ``;``, ``:``).""" @@ -871,13 +972,9 @@ class Tokenizer(object): elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() elif this == next == "'": - if not self._context & contexts.STYLE and self._can_recurse(): - self._parse_style() - elif (self._context & contexts.STYLE_ITALICS or - self._read(2) == "'" and self._context & contexts.STYLE_BOLD): - return self._handle_style_end() - else: - self._emit_text("'") + result = self._parse_style() + if result is not None: + return result elif self._read(-1) in ("\n", self.START): if this in ("#", "*", ";", ":"): self._handle_list() From dd4591c270577e1558f140a767d60986ed32820b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 13 Aug 2013 17:12:44 -0400 Subject: [PATCH 16/27] Much refactoring and cleanup. 
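After tick normalization, _parse_style() reduces to a single dispatch on the tick count and the current style context. A condensed, illustrative rendering of that decision (_dispatch_style is a made-up name; error and recursion-depth paths omitted, see the diff below for the real code):

    def _dispatch_style(self, ticks):
        italics = self._context & contexts.STYLE_ITALICS
        bold = self._context & contexts.STYLE_BOLD
        if (italics and ticks in (2, 5)) or (bold and ticks in (3, 5)):
            return self._pop()  # close the current span; five ticks also rewind self._head
        if ticks == 2:
            self._parse_italics()
        elif ticks == 3:
            if self._parse_bold():  # True signals the enclosing italics span ends here
                return self._pop()
        elif ticks == 5:
            self._parse_italics_and_bold()
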
--- mwparserfromhell/parser/tokenizer.py | 236 ++++++++++++++++------------------- 1 file changed, 110 insertions(+), 126 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index bbeefd6..89481d8 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -629,20 +629,106 @@ class Tokenizer(object): else: self._emit_all(tag) - def _really_parse_style(self, context): - """Parse wiki-style bold or italics. Raises :py:exc:`BadRoute`.""" - stack = self._parse(context) - markup = "''" if context & contexts.STYLE_ITALICS else "'''" - tag = "i" if context & contexts.STYLE_ITALICS else "b" - + def _emit_tag_open(self, tag, markup): + """Write the three tokens in a tag opening sequence.""" self._emit(tokens.TagOpenOpen(wiki_markup=markup)) self._emit_text(tag) self._emit(tokens.TagCloseOpen()) - self._emit_all(stack) + + def _emit_tag_close(self, tag): + """Write the three tokens in a tag closing sequence.""" self._emit(tokens.TagOpenClose()) self._emit_text(tag) self._emit(tokens.TagCloseClose()) + def _really_parse_style(self, context, reset, markup, tag): + """Parse wiki-style bold or italics. Raises :py:exc:`BadRoute`.""" + if context & contexts.STYLE_ITALICS: + try: + stack = self._parse(context) + except BadRoute: ## only if STYLE_PASS_AGAIN in destroyed context + self._head = reset + stack = self._parse(context | contexts.STYLE_PASS_2) + else: + stack = self._parse(context) + + self._emit_tag_open(tag, markup) + self._emit_all(stack) + self._emit_tag_close(tag) + + def _parse_italics(self): + """Parse wiki-style italics.""" + reset = self._head + try: + self._really_parse_style(contexts.STYLE_ITALICS, reset, "''", "i") + except BadRoute: + self._head = reset + self._emit_text("''") + + def _parse_bold(self): + """Parse wiki-style bold.""" + reset = self._head + try: + self._really_parse_style(contexts.STYLE_BOLD, reset, "'''", "b") + except BadRoute: + self._head = reset + if self._context & contexts.STYLE_PASS_2: + self._emit_text("'") + return True + elif self._context & contexts.STYLE_ITALICS: + # Set STYLE_PASS_AGAIN + self._emit_text("'''") + else: + self._emit_text("'") + self._parse_italics() + + def _parse_italics_and_bold(self): + """Parse wiki-style italics and bold together (i.e., five ticks).""" + reset = self._head + try: + stack = self._parse(contexts.STYLE_BOLD) + except BadRoute: + self._head = reset + try: + stack = self._parse(contexts.STYLE_ITALICS) + except BadRoute: + self._head = reset + self._emit_text("'''''") + else: + reset = self._head + try: + stack2 = self._parse(contexts.STYLE_BOLD) + except BadRoute: + self._head = reset + self._emit_text("'''") + self._emit_tag_open("i", "''") + self._emit_all(stack) + self._emit_tag_close("i") + else: + self._emit_tag_open("b", "'''") + self._emit_tag_open("i", "''") + self._emit_all(stack) + self._emit_tag_close("i") + self._emit_all(stack2) + self._emit_tag_close("b") + else: + reset = self._head + try: + stack2 = self._parse(contexts.STYLE_ITALICS) + except BadRoute: + self._head = reset + self._emit_text("''") + self._emit_tag_open("b", "'''") + self._emit_all(stack) + self._emit_tag_close("b") + else: + self._emit_tag_open("i", "''") + self._emit_tag_open("b", "'''") + self._emit_all(stack) + self._emit_tag_close("b") + self._emit_all(stack2) + self._emit_tag_close("i") + def _parse_style(self): """Parse wiki-style formatting (``''``/``'''`` for italics/bold).""" self._head += 2 @@ -650,7 +736,8 @@ class Tokenizer(object): while 
self._read() == "'": self._head += 1 ticks += 1 - reset = self._head + italics = self._context & contexts.STYLE_ITALICS + bold = self._context & contexts.STYLE_BOLD if ticks > 5: self._emit_text("'" * (ticks - 5)) @@ -659,127 +746,24 @@ class Tokenizer(object): self._emit_text("'") ticks = 3 - if ticks == 2: - if self._context & contexts.STYLE_ITALICS: - return self._pop() - if self._can_recurse(): - try: - self._really_parse_style(contexts.STYLE_ITALICS) - except BadRoute: - self._head = reset - try: ## only if STYLE_PASS_AGAIN in destroyed context - self._really_parse_style(contexts.STYLE_ITALICS|contexts.STYLE_PASS_2) - except BadRoute: - self._head = reset - self._emit_text("''") - else: - self._emit_text("''") + if (italics and ticks in (2, 5)) or (bold and ticks in (3, 5)): + if ticks == 5: + self._head -= 3 if italics else 2 + return self._pop() + elif not self._can_recurse(): + if ticks == 3: + if self._context & contexts.STYLE_PASS_2: + self._emit_text("'") + return self._pop() + # Set STYLE_PASS_AGAIN + self._emit_text("'" * ticks) + elif ticks == 2: + self._parse_italics() elif ticks == 3: - if self._context & contexts.STYLE_BOLD: + if self._parse_bold(): return self._pop() - elif self._can_recurse(): - try: - self._really_parse_style(contexts.STYLE_BOLD) - except BadRoute: - self._head = reset - if self._context & contexts.STYLE_ITALICS: - if self._context & contexts.STYLE_PASS_2: - self._emit_text("'") - return self._pop() - self._emit_text("'''") ## here is our hook for STYLE_PASS_AGAIN - else: - self._emit_text("'") - try: - self._really_parse_style(contexts.STYLE_ITALICS) - except BadRoute: - self._head = reset - try: ## only if STYLE_PASS_AGAIN in destroyed context - self._really_parse_style(contexts.STYLE_ITALICS|contexts.STYLE_PASS_2) - except BadRoute: - self._head = reset - self._emit_text("''") - elif self._context & contexts.STYLE_ITALICS and self._context & contexts.STYLE_PASS_2: - self._emit_text("'") - return self._pop() - else: ## here is our hook for STYLE_PASS_AGAIN - self._emit_text("'''") elif ticks == 5: - if self._context & contexts.STYLE_ITALICS: - self._head -= 3 - return self._pop() - elif self._context & contexts.STYLE_BOLD: - self._head -= 2 - return self._pop() - elif self._can_recurse(): - try: - stack = self._parse(contexts.STYLE_BOLD) - except BadRoute: - self._head = reset - try: - stack = self._parse(contexts.STYLE_ITALICS) - except BadRoute: - self._head = reset - self._emit_text("'''''") - else: - reset = self._head - try: - stack2 = self._parse(contexts.STYLE_BOLD) - except BadRoute: - self._head = reset - self._emit_text("'''") - self._emit(tokens.TagOpenOpen(wiki_markup="''")) - self._emit_text("i") - self._emit(tokens.TagCloseOpen()) - self._emit_all(stack) - self._emit(tokens.TagOpenClose()) - self._emit_text("i") - self._emit(tokens.TagCloseClose()) - else: - self._emit(tokens.TagOpenOpen(wiki_markup="'''")) - self._emit_text("b") - self._emit(tokens.TagCloseOpen()) - self._emit(tokens.TagOpenOpen(wiki_markup="''")) - self._emit_text("i") - self._emit(tokens.TagCloseOpen()) - self._emit_all(stack) - self._emit(tokens.TagOpenClose()) - self._emit_text("i") - self._emit(tokens.TagCloseClose()) - self._emit_all(stack2) - self._emit(tokens.TagOpenClose()) - self._emit_text("b") - self._emit(tokens.TagCloseClose()) - else: - reset = self._head - try: - stack2 = self._parse(contexts.STYLE_ITALICS) - except BadRoute: - self._head = reset - self._emit_text("''") - self._emit(tokens.TagOpenOpen(wiki_markup="'''")) - self._emit_text("b") - 
self._emit(tokens.TagCloseOpen()) - self._emit_all(stack) - self._emit(tokens.TagOpenClose()) - self._emit_text("b") - self._emit(tokens.TagCloseClose()) - else: - self._emit(tokens.TagOpenOpen(wiki_markup="''")) - self._emit_text("i") - self._emit(tokens.TagCloseOpen()) - self._emit(tokens.TagOpenOpen(wiki_markup="'''")) - self._emit_text("b") - self._emit(tokens.TagCloseOpen()) - self._emit_all(stack) - self._emit(tokens.TagOpenClose()) - self._emit_text("b") - self._emit(tokens.TagCloseClose()) - self._emit_all(stack2) - self._emit(tokens.TagOpenClose()) - self._emit_text("i") - self._emit(tokens.TagCloseClose()) - else: - self._emit_text("'''''") + self._parse_italics_and_bold() self._head -= 1 def _handle_list_marker(self): From 4c0e4402b4cb2aa27f37df710925e401efcf8cf5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 13 Aug 2013 17:29:05 -0400 Subject: [PATCH 17/27] Only do a second pass if one would produce a different result. --- mwparserfromhell/parser/contexts.py | 26 ++++++++++++++------------ mwparserfromhell/parser/tokenizer.py | 21 +++++++++++++-------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index a7e70fa..d3f0254 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -73,7 +73,8 @@ Local (stack-specific) contexts: * :py:const:`STYLE_ITALICS` * :py:const:`STYLE_BOLD` - * :py:const:`STYLE_PASS_2` + * :py:const:`STYLE_PASS_AGAIN` + * :py:const:`STYLE_SECOND_PASS` * :py:const:`DL_TERM` @@ -123,19 +124,20 @@ TAG_BODY = 1 << 16 TAG_CLOSE = 1 << 17 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE -STYLE_ITALICS = 1 << 18 -STYLE_BOLD = 1 << 19 -STYLE_PASS_2 = 1 << 20 -STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_2 +STYLE_ITALICS = 1 << 18 +STYLE_BOLD = 1 << 19 +STYLE_PASS_AGAIN = 1 << 20 +STYLE_SECOND_PASS = 1 << 21 +STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS -DL_TERM = 1 << 21 +DL_TERM = 1 << 22 -HAS_TEXT = 1 << 22 -FAIL_ON_TEXT = 1 << 23 -FAIL_NEXT = 1 << 24 -FAIL_ON_LBRACE = 1 << 25 -FAIL_ON_RBRACE = 1 << 26 -FAIL_ON_EQUALS = 1 << 27 +HAS_TEXT = 1 << 23 +FAIL_ON_TEXT = 1 << 24 +FAIL_NEXT = 1 << 25 +FAIL_ON_LBRACE = 1 << 26 +FAIL_ON_RBRACE = 1 << 27 +FAIL_ON_EQUALS = 1 << 28 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 89481d8..4b9b9db 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -32,7 +32,9 @@ __all__ = ["Tokenizer"] class BadRoute(Exception): """Raised internally when the current tokenization route is invalid.""" - pass + + def __init__(self, context=0): + self.context = context class _TagOpenData(object): @@ -132,8 +134,9 @@ class Tokenizer(object): Discards the current stack/context/textbuffer and raises :py:exc:`~.BadRoute`. 
""" + context = self._context self._pop() - raise BadRoute() + raise BadRoute(context) def _emit(self, token): """Write a token to the end of the current token stack.""" @@ -646,9 +649,11 @@ class Tokenizer(object): if context & contexts.STYLE_ITALICS: try: stack = self._parse(context) - except BadRoute: ## only if STYLE_PASS_AGAIN in destroyed context + except BadRoute as route: + if not route.context & contexts.STYLE_PASS_AGAIN: + raise self._head = reset - stack = self._parse(context | contexts.STYLE_PASS_2) + stack = self._parse(context | contexts.STYLE_SECOND_PASS) else: stack = self._parse(context) @@ -672,11 +677,11 @@ class Tokenizer(object): self._really_parse_style(contexts.STYLE_BOLD, reset, "'''", "b") except BadRoute: self._head = reset - if self._context & contexts.STYLE_PASS_2: + if self._context & contexts.STYLE_SECOND_PASS: self._emit_text("'") return True elif self._context & contexts.STYLE_ITALICS: - # Set STYLE_PASS_AGAIN + self._context |= contexts.STYLE_PASS_AGAIN self._emit_text("'''") else: self._emit_text("'") @@ -752,10 +757,10 @@ class Tokenizer(object): return self._pop() elif not self._can_recurse(): if ticks == 3: - if self._context & contexts.STYLE_PASS_2: + if self._context & contexts.STYLE_SECOND_PASS: self._emit_text("'") return self._pop() - # Set STYLE_PASS_AGAIN + self._context |= contexts.STYLE_PASS_AGAIN self._emit_text("'" * ticks) elif ticks == 2: self._parse_italics() From 120f53deec842ca93dfdcee14e677433d5979774 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 13 Aug 2013 17:39:21 -0400 Subject: [PATCH 18/27] Refactor out _really_parse_style() --- mwparserfromhell/parser/tokenizer.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 4b9b9db..1dcc194 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -644,37 +644,27 @@ class Tokenizer(object): self._emit_text(tag) self._emit(tokens.TagCloseClose()) - def _really_parse_style(self, context, reset, markup, tag): - """Parse wiki-style bold or italics. 
Raises :py:exc:`BadRoute`.""" - if context & contexts.STYLE_ITALICS: - try: - stack = self._parse(context) - except BadRoute as route: - if not route.context & contexts.STYLE_PASS_AGAIN: - raise - self._head = reset - stack = self._parse(context | contexts.STYLE_SECOND_PASS) - else: - stack = self._parse(context) - - self._emit_tag_open(tag, markup) - self._emit_all(stack) - self._emit_tag_close(tag) - def _parse_italics(self): """Parse wiki-style italics.""" reset = self._head try: - self._really_parse_style(contexts.STYLE_ITALICS, reset, "''", "i") - except BadRoute: + stack = self._parse(contexts.STYLE_ITALICS) + except BadRoute as route: self._head = reset - self._emit_text("''") + if route.context & contexts.STYLE_PASS_AGAIN: + stack = self._parse(route.context | contexts.STYLE_SECOND_PASS) + else: + return self._emit_text("''") + + self._emit_tag_open("i", "''") + self._emit_all(stack) + self._emit_tag_close("i") def _parse_bold(self): """Parse wiki-style bold.""" reset = self._head try: - self._really_parse_style(contexts.STYLE_BOLD, reset, "'''", "b") + stack = self._parse(contexts.STYLE_BOLD) except BadRoute: self._head = reset if self._context & contexts.STYLE_SECOND_PASS: @@ -686,6 +676,10 @@ class Tokenizer(object): else: self._emit_text("'") self._parse_italics() + else: + self._emit_tag_open("b", "'''") + self._emit_all(stack) + self._emit_tag_close("b") def _parse_italics_and_bold(self): """Parse wiki-style italics and bold together (i.e., five ticks).""" From 38143e1adca9479bad9eda7653943ba8e765efe6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 13 Aug 2013 20:11:10 -0400 Subject: [PATCH 19/27] More refactoring (combine _emit_tag_open() and _emit_tag_close()) --- mwparserfromhell/parser/tokenizer.py | 41 ++++++++++++------------------------ 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 1dcc194..d4197e6 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -632,14 +632,12 @@ class Tokenizer(object): else: self._emit_all(tag) - def _emit_tag_open(self, tag, markup): - """Write the three tokens in a tag opening sequence.""" + def _emit_style_tag(self, tag, markup, body): + """Write the body of a tag and the tokens that should surround it.""" self._emit(tokens.TagOpenOpen(wiki_markup=markup)) self._emit_text(tag) self._emit(tokens.TagCloseOpen()) - - def _emit_tag_close(self, tag): - """Write the three tokens in a tag closing sequence.""" + self._emit_all(body) self._emit(tokens.TagOpenClose()) self._emit_text(tag) self._emit(tokens.TagCloseClose()) @@ -655,10 +653,7 @@ class Tokenizer(object): stack = self._parse(route.context | contexts.STYLE_SECOND_PASS) else: return self._emit_text("''") - - self._emit_tag_open("i", "''") - self._emit_all(stack) - self._emit_tag_close("i") + self._emit_style_tag("i", "''", stack) def _parse_bold(self): """Parse wiki-style bold.""" @@ -677,9 +672,7 @@ class Tokenizer(object): self._emit_text("'") self._parse_italics() else: - self._emit_tag_open("b", "'''") - self._emit_all(stack) - self._emit_tag_close("b") + self._emit_style_tag("b", "'''", stack) def _parse_italics_and_bold(self): """Parse wiki-style italics and bold together (i.e., five ticks).""" @@ -700,16 +693,12 @@ class Tokenizer(object): except BadRoute: self._head = reset self._emit_text("'''") - self._emit_tag_open("i", "''") - self._emit_all(stack) - self._emit_tag_close("i") + self._emit_style_tag("i", "''", stack) else: - 
self._emit_tag_open("b", "'''") - self._emit_tag_open("i", "''") - self._emit_all(stack) - self._emit_tag_close("i") + self._push() + self._emit_style_tag("i", "''", stack) self._emit_all(stack2) - self._emit_tag_close("b") + self._emit_style_tag("b", "'''", self._pop()) else: reset = self._head try: @@ -717,16 +706,12 @@ class Tokenizer(object): except BadRoute: self._head = reset self._emit_text("''") - self._emit_tag_open("b", "'''") - self._emit_all(stack) - self._emit_tag_close("b") + self._emit_style_tag("b", "'''", stack) else: - self._emit_tag_open("i", "''") - self._emit_tag_open("b", "'''") - self._emit_all(stack) - self._emit_tag_close("b") + self._push() + self._emit_style_tag("b", "'''", stack) self._emit_all(stack2) - self._emit_tag_close("i") + self._emit_style_tag("i", "''", self._pop()) def _parse_style(self): """Parse wiki-style formatting (``''``/``'''`` for italics/bold).""" From 9b98907751c28c48e0a2ff97583c26f371948128 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 13 Aug 2013 20:55:35 -0400 Subject: [PATCH 20/27] Add C hooks and prototypes for wiki-markup tags. --- mwparserfromhell/parser/tokenizer.c | 227 ++++++++++++++++++++++++++++++++++-- mwparserfromhell/parser/tokenizer.h | 96 ++++++++------- 2 files changed, 273 insertions(+), 50 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index bae5ec2..be996ad 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -325,9 +325,10 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) */ static void* Tokenizer_fail_route(Tokenizer* self) { + int context = self->topstack->context; PyObject* stack = Tokenizer_pop(self); Py_XDECREF(stack); - FAIL_ROUTE(); + FAIL_ROUTE(context); return NULL; } @@ -1776,7 +1777,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) return -1; } if (!IS_SINGLE_ONLY(name)) - FAIL_ROUTE(); + FAIL_ROUTE(0); break; } Textbuffer_write(&buf, this); @@ -1823,12 +1824,201 @@ static int Tokenizer_parse_tag(Tokenizer* self) } /* + Write the body of a tag and the tokens that should surround it. +*/ +static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks, + PyObject* body) +{ + // self._emit(tokens.TagOpenOpen(wiki_markup=markup)) + // self._emit_text(tag) + // self._emit(tokens.TagCloseOpen()) + // self._emit_all(body) + // self._emit(tokens.TagOpenClose()) + // self._emit_text(tag) + // self._emit(tokens.TagCloseClose()) +} + +/* + Parse wiki-style italics. +*/ +static int Tokenizer_parse_italics(Tokenizer* self) +{ + // reset = self._head + // try: + // stack = self._parse(contexts.STYLE_ITALICS) + // except BadRoute as route: + // self._head = reset + // if route.context & contexts.STYLE_PASS_AGAIN: + // stack = self._parse(route.context | contexts.STYLE_SECOND_PASS) + // else: + // return self._emit_text("''") + // self._emit_style_tag("i", "''", stack) +} + +/* + Parse wiki-style bold. 
+*/ +static int Tokenizer_parse_bold(Tokenizer* self) +{ + // reset = self._head + // try: + // stack = self._parse(contexts.STYLE_BOLD) + // except BadRoute: + // self._head = reset + // if self._context & contexts.STYLE_SECOND_PASS: + // self._emit_text("'") + // return True ## we can return 1 for this and -1 for errors (switch case) + // elif self._context & contexts.STYLE_ITALICS: + // self._context |= contexts.STYLE_PASS_AGAIN + // self._emit_text("'''") + // else: + // self._emit_text("'") + // self._parse_italics() + // else: + // self._emit_style_tag("b", "'''", stack) +} + +/* + Parse wiki-style italics and bold together (i.e., five ticks). +*/ +static int Tokenizer_parse_italics_and_bold(Tokenizer* self) +{ + // reset = self._head + // try: + // stack = self._parse(contexts.STYLE_BOLD) + // except BadRoute: + // self._head = reset + // try: + // stack = self._parse(contexts.STYLE_ITALICS) + // except BadRoute: + // self._head = reset + // self._emit_text("'''''") + // else: + // reset = self._head + // try: + // stack2 = self._parse(contexts.STYLE_BOLD) + // except BadRoute: + // self._head = reset + // self._emit_text("'''") + // self._emit_style_tag("i", "''", stack) + // else: + // self._push() + // self._emit_style_tag("i", "''", stack) + // self._emit_all(stack2) + // self._emit_style_tag("b", "'''", self._pop()) + // else: + // reset = self._head + // try: + // stack2 = self._parse(contexts.STYLE_ITALICS) + // except BadRoute: + // self._head = reset + // self._emit_text("''") + // self._emit_style_tag("b", "'''", stack) + // else: + // self._push() + // self._emit_style_tag("b", "'''", stack) + // self._emit_all(stack2) + // self._emit_style_tag("i", "''", self._pop()) +} + +/* + Parse wiki-style formatting (''/''' for italics/bold). +*/ +static PyObject* Tokenizer_parse_style(Tokenizer* self) +{ + // self._head += 2 + // ticks = 2 + // while self._read() == "'": + // self._head += 1 + // ticks += 1 + // italics = self._context & contexts.STYLE_ITALICS + // bold = self._context & contexts.STYLE_BOLD + // if ticks > 5: + // self._emit_text("'" * (ticks - 5)) + // ticks = 5 + // elif ticks == 4: + // self._emit_text("'") + // ticks = 3 + // if (italics and ticks in (2, 5)) or (bold and ticks in (3, 5)): + // if ticks == 5: + // self._head -= 3 if italics else 2 + // return self._pop() + // elif not self._can_recurse(): + // if ticks == 3: + // if self._context & contexts.STYLE_SECOND_PASS: + // self._emit_text("'") + // return self._pop() + // self._context |= contexts.STYLE_PASS_AGAIN + // self._emit_text("'" * ticks) + // elif ticks == 2: + // self._parse_italics() + // elif ticks == 3: + // if self._parse_bold(): + // return self._pop() + // elif ticks == 5: + // self._parse_italics_and_bold() + // self._head -= 1 + // ## we can return Py_None for non-error empty returns +} + +/* + Handle a list marker at the head (#, *, ;, :). +*/ +static int Tokenizer_handle_list_marker(Tokenizer* self) +{ + // markup = self._read() + // if markup == ";": + // self._context |= contexts.DL_TERM + // self._emit(tokens.TagOpenOpen(wiki_markup=markup)) + // self._emit_text(get_html_tag(markup)) + // self._emit(tokens.TagCloseSelfclose()) +} + +/* + Handle a wiki-style list (#, *, ;, :). +*/ +static int Tokenizer_handle_list(Tokenizer* self) +{ + // self._handle_list_marker() + // while self._read(1) in ("#", "*", ";", ":"): + // self._head += 1 + // self._handle_list_marker() +} + +/* + Handle a wiki-style horizontal rule (----) in the string. 
+*/ +static int Tokenizer_handle_hr(Tokenizer* self) +{ + // length = 4 + // self._head += 3 + // while self._read(1) == "-": + // length += 1 + // self._head += 1 + // self._emit(tokens.TagOpenOpen(wiki_markup="-" * length)) + // self._emit_text("hr") + // self._emit(tokens.TagCloseSelfclose()) +} + +/* + Handle the term in a description list ('foo' in ';foo:bar'). +*/ +static int Tokenizer_handle_dl_term(Tokenizer* self) +{ + // self._context ^= contexts.DL_TERM + // if self._read() == ":": + // self._handle_list_marker() + // else: + // self._emit_text("\n") +} + +/* Handle the end of the stream of wikitext. */ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) { static int fail_contexts = (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | - LC_HEADING | LC_COMMENT | LC_TAG); + LC_HEADING | LC_COMMENT | LC_TAG | LC_STYLE); static int double_fail = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE); PyObject *token, *text, *trash; int single; @@ -1943,7 +2133,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) static int double_unsafe = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE); int this_context, is_marker, i; Py_UNICODE this, next, next_next, last; - PyObject* trash; + PyObject* temp; if (push) { if (Tokenizer_push(self, context)) @@ -1955,8 +2145,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) if (this_context & unsafe_contexts) { if (Tokenizer_verify_safe(self, this_context, this) < 0) { if (this_context & double_unsafe) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); + temp = Tokenizer_pop(self); + Py_XDECREF(temp); } return Tokenizer_fail_route(self); } @@ -1977,6 +2167,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) if (this == *"") return Tokenizer_handle_end(self, this_context); next = Tokenizer_READ(self, 1); + last = Tokenizer_READ_BACKWARDS(self, 1); if (this_context & LC_COMMENT) { if (this == next && next == *"-") { if (Tokenizer_READ(self, 2) == *">") @@ -2030,7 +2221,6 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) else if (this == next && next == *"]" && this_context & LC_WIKILINK) return Tokenizer_handle_wikilink_end(self); else if (this == *"=" && !(self->global & GL_HEADING)) { - last = Tokenizer_READ_BACKWARDS(self, 1); if (last == *"\n" || last == *"") { if (Tokenizer_parse_heading(self)) return NULL; @@ -2077,6 +2267,29 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) } else if (this == *">" && this_context & LC_TAG_CLOSE) return Tokenizer_handle_tag_close_close(self); + else if (this == next && next == *"'") { + temp = Tokenizer_parse_style(self); + if (temp) + return temp; + } + else if (last == *"\n" || last == *"") { + if (this == *"#" || this == *"*" || this == *";" || this == *":") { + if (Tokenizer_handle_list(self)) + return NULL; + } + else if (this == *"-" && this == next && + this == Tokenizer_READ(self, 2) && + this == Tokenizer_READ(self, 3)) { + if (Tokenizer_handle_hr(self)) + return NULL; + } + else if (Tokenizer_emit_text(self, this)) + return NULL; + } + else if ((this == *"\n" || this == *":") && this_context & LC_DLTERM) { + if (Tokenizer_handle_dl_term(self)) + return NULL; + } else if (Tokenizer_emit_text(self, this)) return NULL; self->head++; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index c42f5f9..29e8fbe 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -41,20 +41,21 @@ SOFTWARE. 
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" static const char* MARKERS[] = { - "{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-", - "\n", ""}; + "{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", ":", "/", + "-", "\n", ""}; -#define NUM_MARKERS 17 +#define NUM_MARKERS 18 #define TEXTBUFFER_BLOCKSIZE 1024 #define MAX_DEPTH 40 #define MAX_CYCLES 100000 #define MAX_BRACES 255 #define MAX_ENTITY_SIZE 8 -static int route_state = 0; -#define BAD_ROUTE (route_state) -#define FAIL_ROUTE() (route_state = 1) -#define RESET_ROUTE() (route_state = 0) +static int route_state = 0, route_context = 0; +#define BAD_ROUTE route_state +#define BAD_ROUTE_CONTEXT route_context +#define FAIL_ROUTE(context) route_state = 1; route_context = context +#define RESET_ROUTE() route_state = 0 static char** entitydefs; @@ -102,42 +103,50 @@ static PyObject* TagCloseClose; /* Local contexts: */ -#define LC_TEMPLATE 0x000007 -#define LC_TEMPLATE_NAME 0x000001 -#define LC_TEMPLATE_PARAM_KEY 0x000002 -#define LC_TEMPLATE_PARAM_VALUE 0x000004 - -#define LC_ARGUMENT 0x000018 -#define LC_ARGUMENT_NAME 0x000008 -#define LC_ARGUMENT_DEFAULT 0x000010 - -#define LC_WIKILINK 0x000060 -#define LC_WIKILINK_TITLE 0x000020 -#define LC_WIKILINK_TEXT 0x000040 - -#define LC_HEADING 0x001F80 -#define LC_HEADING_LEVEL_1 0x000080 -#define LC_HEADING_LEVEL_2 0x000100 -#define LC_HEADING_LEVEL_3 0x000200 -#define LC_HEADING_LEVEL_4 0x000400 -#define LC_HEADING_LEVEL_5 0x000800 -#define LC_HEADING_LEVEL_6 0x001000 - -#define LC_COMMENT 0x002000 - -#define LC_TAG 0x03C000 -#define LC_TAG_OPEN 0x004000 -#define LC_TAG_ATTR 0x008000 -#define LC_TAG_BODY 0x010000 -#define LC_TAG_CLOSE 0x020000 - -#define LC_SAFETY_CHECK 0xFC0000 -#define LC_HAS_TEXT 0x040000 -#define LC_FAIL_ON_TEXT 0x080000 -#define LC_FAIL_NEXT 0x100000 -#define LC_FAIL_ON_LBRACE 0x200000 -#define LC_FAIL_ON_RBRACE 0x400000 -#define LC_FAIL_ON_EQUALS 0x800000 +#define LC_TEMPLATE 0x00000007 +#define LC_TEMPLATE_NAME 0x00000001 +#define LC_TEMPLATE_PARAM_KEY 0x00000002 +#define LC_TEMPLATE_PARAM_VALUE 0x00000004 + +#define LC_ARGUMENT 0x00000018 +#define LC_ARGUMENT_NAME 0x00000008 +#define LC_ARGUMENT_DEFAULT 0x00000010 + +#define LC_WIKILINK 0x00000060 +#define LC_WIKILINK_TITLE 0x00000020 +#define LC_WIKILINK_TEXT 0x00000040 + +#define LC_HEADING 0x00001F80 +#define LC_HEADING_LEVEL_1 0x00000080 +#define LC_HEADING_LEVEL_2 0x00000100 +#define LC_HEADING_LEVEL_3 0x00000200 +#define LC_HEADING_LEVEL_4 0x00000400 +#define LC_HEADING_LEVEL_5 0x00000800 +#define LC_HEADING_LEVEL_6 0x00001000 + +#define LC_COMMENT 0x00002000 + +#define LC_TAG 0x0003C000 +#define LC_TAG_OPEN 0x00004000 +#define LC_TAG_ATTR 0x00008000 +#define LC_TAG_BODY 0x00010000 +#define LC_TAG_CLOSE 0x00020000 + +#define LC_STYLE 0x003C0000 +#define LC_STYLE_ITALICS 0x00040000 +#define LC_STYLE_BOLD 0x00080000 +#define LC_STYLE_PASS_AGAIN 0x00100000 +#define LC_STYLE_SECOND_PASS 0x00200000 + +#define LC_DLTERM 0x00400000 + +#define LC_SAFETY_CHECK 0x1F800000 +#define LC_HAS_TEXT 0x00800000 +#define LC_FAIL_ON_TEXT 0x01000000 +#define LC_FAIL_NEXT 0x02000000 +#define LC_FAIL_ON_LBRACE 0x04000000 +#define LC_FAIL_ON_RBRACE 0x08000000 +#define LC_FAIL_ON_EQUALS 0x10000000 /* Global contexts: */ @@ -211,6 +220,7 @@ typedef struct { /* Macros for accessing HTML tag definitions: */ +#define GET_HTML_TAG(markup) (call_tag_def_func("get_html_tag", markup)) #define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag)) #define IS_SINGLE(tag) 
(call_tag_def_func("is_single", tag)) #define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag)) From 22d7ad032a9d8975fa0b67213d1db12a44227d72 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 13 Aug 2013 21:47:42 -0400 Subject: [PATCH 21/27] emit_style_tag, parse_italics, parse_bold --- mwparserfromhell/parser/tokenizer.c | 136 +++++++++++++++++++++++++++--------- 1 file changed, 103 insertions(+), 33 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index be996ad..ab0c0db 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1829,13 +1829,64 @@ static int Tokenizer_parse_tag(Tokenizer* self) static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks, PyObject* body) { - // self._emit(tokens.TagOpenOpen(wiki_markup=markup)) - // self._emit_text(tag) - // self._emit(tokens.TagCloseOpen()) - // self._emit_all(body) - // self._emit(tokens.TagOpenClose()) - // self._emit_text(tag) - // self._emit(tokens.TagCloseClose()) + PyObject *markup, *kwargs, *token; + char chr_markup[4]; + int i; + + for (i = 0; i < ticks; i++) chr_markup[i] = *"'"; + chr_markup[ticks] = *""; + markup = PyBytes_FromString(chr_markup); + if (!markup) + return -1; + kwargs = PyDict_New(); + if (!kwargs) { + Py_DECREF(markup); + return -1; + } + PyDict_SetItemString(kwargs, "wiki_markup", markup); + Py_DECREF(markup); + token = PyObject_Call(TagOpenOpen, NOARGS, kwargs); + if (!token) { + Py_DECREF(kwargs); + return -1; + } + Py_DECREF(kwargs); + if (Tokenizer_emit(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + if (Tokenizer_emit_text(self, tag)) + return -1; + token = PyObject_CallObject(TagCloseOpen, NULL); + if (!token) + return -1; + if (Tokenizer_emit(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + if (Tokenizer_emit_all(self, body)) + return -1; + token = PyObject_CallObject(TagOpenClose, NULL); + if (!token) + return -1; + if (Tokenizer_emit(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + if (Tokenizer_emit_text(self, tag)) + return -1; + token = PyObject_CallObject(TagCloseClose, NULL); + if (!token) + return -1; + if (Tokenizer_emit(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + return 0; } /* @@ -1843,16 +1894,27 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks, */ static int Tokenizer_parse_italics(Tokenizer* self) { - // reset = self._head - // try: - // stack = self._parse(contexts.STYLE_ITALICS) - // except BadRoute as route: - // self._head = reset - // if route.context & contexts.STYLE_PASS_AGAIN: - // stack = self._parse(route.context | contexts.STYLE_SECOND_PASS) - // else: - // return self._emit_text("''") - // self._emit_style_tag("i", "''", stack) + Py_ssize_t reset = self->head; + int context; + PyObject *stack; + + stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + if (BAD_ROUTE_CONTEXT & LC_STYLE_PASS_AGAIN) { + context = LC_STYLE_ITALICS | LC_STYLE_SECOND_PASS; + stack = Tokenizer_parse(self, context, 1); + } + else { + if (Tokenizer_emit_text(self, *"'")) + return -1; + return Tokenizer_emit_text(self, *"'"); + } + } + if (!stack) + return -1; + return Tokenizer_emit_style_tag(self, *"i", 2, stack); } /* @@ -1860,22 +1922,30 @@ static int Tokenizer_parse_italics(Tokenizer* self) */ static int Tokenizer_parse_bold(Tokenizer* self) { - // reset = self._head - // try: - // stack = 
self._parse(contexts.STYLE_BOLD) - // except BadRoute: - // self._head = reset - // if self._context & contexts.STYLE_SECOND_PASS: - // self._emit_text("'") - // return True ## we can return 1 for this and -1 for errors (switch case) - // elif self._context & contexts.STYLE_ITALICS: - // self._context |= contexts.STYLE_PASS_AGAIN - // self._emit_text("'''") - // else: - // self._emit_text("'") - // self._parse_italics() - // else: - // self._emit_style_tag("b", "'''", stack) + Py_ssize_t reset = self->head; + PyObject *stack; + + stack = Tokenizer_parse(self, LC_STYLE_BOLD, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + if (self->topstack->context & LC_STYLE_SECOND_PASS) + return Tokenizer_emit_text(self, *"'") ? -1 : 1; + if (self->topstack->context & LC_STYLE_ITALICS) { + self->topstack->context |= LC_STYLE_PASS_AGAIN; + if (Tokenizer_emit_text(self, *"'")) + return -1; + if (Tokenizer_emit_text(self, *"'")) + return -1; + return Tokenizer_emit_text(self, *"'"); + } + if (Tokenizer_emit_text(self, *"'")) + return -1; + return Tokenizer_parse_italics(self); + } + if (!stack) + return -1; + return Tokenizer_emit_style_tag(self, *"b", 3, stack); } /* From 34b37562858841afd0a5a322e51da0f51bf15c53 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 14 Aug 2013 01:33:40 -0400 Subject: [PATCH 22/27] parse_italics_and_bold() and parse_style() --- mwparserfromhell/parser/tokenizer.c | 205 ++++++++++++++++++++++++------------ 1 file changed, 135 insertions(+), 70 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index ab0c0db..137c3a5 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1886,6 +1886,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, char tag, int ticks, return -1; } Py_DECREF(token); + Py_DECREF(body); return 0; } @@ -1953,42 +1954,82 @@ static int Tokenizer_parse_bold(Tokenizer* self) */ static int Tokenizer_parse_italics_and_bold(Tokenizer* self) { - // reset = self._head - // try: - // stack = self._parse(contexts.STYLE_BOLD) - // except BadRoute: - // self._head = reset - // try: - // stack = self._parse(contexts.STYLE_ITALICS) - // except BadRoute: - // self._head = reset - // self._emit_text("'''''") - // else: - // reset = self._head - // try: - // stack2 = self._parse(contexts.STYLE_BOLD) - // except BadRoute: - // self._head = reset - // self._emit_text("'''") - // self._emit_style_tag("i", "''", stack) - // else: - // self._push() - // self._emit_style_tag("i", "''", stack) - // self._emit_all(stack2) - // self._emit_style_tag("b", "'''", self._pop()) - // else: - // reset = self._head - // try: - // stack2 = self._parse(contexts.STYLE_ITALICS) - // except BadRoute: - // self._head = reset - // self._emit_text("''") - // self._emit_style_tag("b", "'''", stack) - // else: - // self._push() - // self._emit_style_tag("b", "'''", stack) - // self._emit_all(stack2) - // self._emit_style_tag("i", "''", self._pop()) + Py_ssize_t reset = self->head; + PyObject *stack, *stack2; + + stack = Tokenizer_parse(self, LC_STYLE_BOLD, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + if (Tokenizer_emit_text(self, *"'")) + return -1; + if (Tokenizer_emit_text(self, *"'")) + return -1; + if (Tokenizer_emit_text(self, *"'")) + return -1; + if (Tokenizer_emit_text(self, *"'")) + return -1; + return Tokenizer_emit_text(self, *"'"); + } + if (!stack) + 
return -1; + reset = self->head; + stack2 = Tokenizer_parse(self, LC_STYLE_BOLD, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + if (Tokenizer_emit_text(self, *"'")) + return -1; + if (Tokenizer_emit_text(self, *"'")) + return -1; + if (Tokenizer_emit_text(self, *"'")) + return -1; + return Tokenizer_emit_style_tag(self, *"i", 2, stack); + } + if (!stack2) + return -1; + if (Tokenizer_push(self, 0)) + return -1; + if (Tokenizer_emit_style_tag(self, *"i", 2, stack)) + return -1; + if (Tokenizer_emit_all(self, stack2)) + return -1; + Py_DECREF(stack2); + stack2 = Tokenizer_pop(self); + if (!stack2) + return -1; + return Tokenizer_emit_style_tag(self, *"b", 3, stack2); + } + if (!stack) + return -1; + reset = self->head; + stack2 = Tokenizer_parse(self, LC_STYLE_ITALICS, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + if (Tokenizer_emit_text(self, *"'")) + return -1; + if (Tokenizer_emit_text(self, *"'")) + return -1; + return Tokenizer_emit_style_tag(self, *"b", 3, stack); + } + if (!stack2) + return -1; + if (Tokenizer_push(self, 0)) + return -1; + if (Tokenizer_emit_style_tag(self, *"b", 3, stack)) + return -1; + if (Tokenizer_emit_all(self, stack2)) + return -1; + Py_DECREF(stack2); + stack2 = Tokenizer_pop(self); + if (!stack2) + return -1; + return Tokenizer_emit_style_tag(self, *"i", 2, stack2); } /* @@ -1996,39 +2037,63 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self) */ static PyObject* Tokenizer_parse_style(Tokenizer* self) { - // self._head += 2 - // ticks = 2 - // while self._read() == "'": - // self._head += 1 - // ticks += 1 - // italics = self._context & contexts.STYLE_ITALICS - // bold = self._context & contexts.STYLE_BOLD - // if ticks > 5: - // self._emit_text("'" * (ticks - 5)) - // ticks = 5 - // elif ticks == 4: - // self._emit_text("'") - // ticks = 3 - // if (italics and ticks in (2, 5)) or (bold and ticks in (3, 5)): - // if ticks == 5: - // self._head -= 3 if italics else 2 - // return self._pop() - // elif not self._can_recurse(): - // if ticks == 3: - // if self._context & contexts.STYLE_SECOND_PASS: - // self._emit_text("'") - // return self._pop() - // self._context |= contexts.STYLE_PASS_AGAIN - // self._emit_text("'" * ticks) - // elif ticks == 2: - // self._parse_italics() - // elif ticks == 3: - // if self._parse_bold(): - // return self._pop() - // elif ticks == 5: - // self._parse_italics_and_bold() - // self._head -= 1 - // ## we can return Py_None for non-error empty returns + int context = self->topstack->context, ticks = 2, i; + + self->head += 2; + while (Tokenizer_READ(self, 0) == *"'") { + self->head++; + ticks++; + } + if (ticks > 5) { + for (i = 0; i < ticks - 5; i++) { + if (Tokenizer_emit_text(self, *"'")) + return NULL; + } + ticks = 5; + } + else if (ticks == 4) { + if (Tokenizer_emit_text(self, *"'")) + return NULL; + ticks = 3; + } + if ((context & LC_STYLE_ITALICS && (ticks == 2 || ticks == 5)) || + (context & LC_STYLE_BOLD && (ticks == 3 || ticks == 5))) { + if (ticks == 5) + self->head -= context & LC_STYLE_ITALICS ? 
3 : 2; + return Tokenizer_pop(self); + } + if (!Tokenizer_CAN_RECURSE(self)) { + if (ticks == 3) { + if (context & LC_STYLE_SECOND_PASS) { + if (Tokenizer_emit_text(self, *"'")) + return NULL; + return Tokenizer_pop(self); + } + self->topstack->context |= LC_STYLE_PASS_AGAIN; + } + for (i = 0; i < ticks; i++) { + if (Tokenizer_emit_text(self, *"'")) + return NULL; + } + } + else if (ticks == 2) { + if (Tokenizer_parse_italics(self)) + return NULL; + } + else if (ticks == 3) { + switch (Tokenizer_parse_bold(self)) { + case 1: + return Tokenizer_pop(self); + case -1: + return NULL; + } + } + else { + if (Tokenizer_parse_italics_and_bold(self)) + return NULL; + } + self->head--; + return Py_None; } /* @@ -2339,7 +2404,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) return Tokenizer_handle_tag_close_close(self); else if (this == next && next == *"'") { temp = Tokenizer_parse_style(self); - if (temp) + if (temp != Py_None) return temp; } else if (last == *"\n" || last == *"") { From c20d3f2a6af7a0ad866c0788b9b1b91badb05571 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 14 Aug 2013 02:01:13 -0400 Subject: [PATCH 23/27] handle_list_marker() and handle_list() --- mwparserfromhell/parser/tokenizer.c | 59 ++++++++++++++++++++++++++++++------- mwparserfromhell/parser/tokenizer.h | 2 +- 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 137c3a5..f5e1f27 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2101,12 +2101,43 @@ static PyObject* Tokenizer_parse_style(Tokenizer* self) */ static int Tokenizer_handle_list_marker(Tokenizer* self) { - // markup = self._read() - // if markup == ";": - // self._context |= contexts.DL_TERM - // self._emit(tokens.TagOpenOpen(wiki_markup=markup)) - // self._emit_text(get_html_tag(markup)) - // self._emit(tokens.TagCloseSelfclose()) + PyObject *markup = Tokenizer_read(self, 0), *kwargs, *token; + Py_UNICODE code = *PyUnicode_AS_UNICODE(markup); + char *html; + int i = 0; + + if (code == *";") + self->topstack->context |= LC_DLTERM; + kwargs = PyDict_New(); + if (!kwargs) + return -1; + PyDict_SetItemString(kwargs, "wiki_markup", markup); + token = PyObject_Call(TagOpenOpen, NOARGS, kwargs); + if (!token) { + Py_DECREF(kwargs); + return -1; + } + Py_DECREF(kwargs); + if (Tokenizer_emit(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + html = GET_HTML_TAG(code); + while (html[i]) { + if (Tokenizer_emit_text(self, html[i])) + return -1; + i++; + } + token = PyObject_CallObject(TagCloseSelfclose, NULL); + if (!token) + return -1; + if (Tokenizer_emit(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + return 0; } /* @@ -2114,10 +2145,18 @@ static int Tokenizer_handle_list_marker(Tokenizer* self) */ static int Tokenizer_handle_list(Tokenizer* self) { - // self._handle_list_marker() - // while self._read(1) in ("#", "*", ";", ":"): - // self._head += 1 - // self._handle_list_marker() + Py_UNICODE marker = Tokenizer_READ(self, 1); + + if (Tokenizer_handle_list_marker(self)) + return -1; + while (marker == *"#" || marker == *"*" || marker == *";" || + marker == *":") { + self->head++; + if (Tokenizer_handle_list_marker(self)) + return -1; + marker = Tokenizer_READ(self, 1); + } + return 0; } /* diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 29e8fbe..4136285 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ 
b/mwparserfromhell/parser/tokenizer.h
@@ -220,7 +220,7 @@ typedef struct {
 
 /* Macros for accessing HTML tag definitions: */
 
-#define GET_HTML_TAG(markup) (call_tag_def_func("get_html_tag", markup))
+#define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li")
 #define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag))
 #define IS_SINGLE(tag) (call_tag_def_func("is_single", tag))
 #define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag))

From 9993ffe8bf4ca38d6c6ff47f348c0962dc511917 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 14 Aug 2013 02:11:16 -0400
Subject: [PATCH 24/27] handle_hr()

---
 mwparserfromhell/parser/tokenizer.c | 57 +++++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 8 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index f5e1f27..5eaa6d8 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -2164,14 +2164,55 @@ static int Tokenizer_handle_list(Tokenizer* self)
 */
 static int Tokenizer_handle_hr(Tokenizer* self)
 {
-    // length = 4
-    // self._head += 3
-    // while self._read(1) == "-":
-    //     length += 1
-    //     self._head += 1
-    // self._emit(tokens.TagOpenOpen(wiki_markup="-" * length))
-    // self._emit_text("hr")
-    // self._emit(tokens.TagCloseSelfclose())
+    PyObject *markup, *kwargs, *token;
+    Textbuffer *buffer = Textbuffer_new();
+    int i;
+
+    if (!buffer)
+        return -1;
+    self->head += 3;
+    for (i = 0; i < 4; i++) {
+        if (Textbuffer_write(&buffer, *"-"))
+            return -1;
+    }
+    while (Tokenizer_READ(self, 1) == *"-") {
+        if (Textbuffer_write(&buffer, *"-"))
+            return -1;
+        self->head++;
+    }
+    markup = Textbuffer_render(buffer);
+    if (!markup)
+        return -1;
+    Textbuffer_dealloc(buffer);
+    kwargs = PyDict_New();
+    if (!kwargs)
+        return -1;
+    PyDict_SetItemString(kwargs, "wiki_markup", markup);
+    Py_DECREF(markup);
+    token = PyObject_Call(TagOpenOpen, NOARGS, kwargs);
+    if (!token) {
+        Py_DECREF(kwargs);
+        return -1;
+    }
+    Py_DECREF(kwargs);
+    if (Tokenizer_emit(self, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+    Py_DECREF(token);
+    if (Tokenizer_emit_text(self, *"h"))
+        return -1;
+    if (Tokenizer_emit_text(self, *"r"))
+        return -1;
+    token = PyObject_CallObject(TagCloseSelfclose, NULL);
+    if (!token)
+        return -1;
+    if (Tokenizer_emit(self, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+    Py_DECREF(token);
+    return 0;
 }
 
 /*

From bbcb906f37a30c22f91b6661c35138da3a18d868 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 14 Aug 2013 02:12:33 -0400
Subject: [PATCH 25/27] handle_dl_term()

---
 mwparserfromhell/parser/tokenizer.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 5eaa6d8..43df293 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -2220,11 +2220,10 @@ static int Tokenizer_handle_hr(Tokenizer* self)
 */
 static int Tokenizer_handle_dl_term(Tokenizer* self)
 {
-    // self._context ^= contexts.DL_TERM
-    // if self._read() == ":":
-    //     self._handle_list_marker()
-    // else:
-    //     self._emit_text("\n")
+    self->topstack->context ^= LC_DLTERM;
+    if (Tokenizer_READ(self, 0) == *":")
+        return Tokenizer_handle_list_marker(self);
+    return Tokenizer_emit_text(self, *"\n");
 }
 
 /*

From c1379d5f21f1f5bfd4bb7a179994225e487519ad Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 14 Aug 2013 02:33:15 -0400
Subject: [PATCH 26/27] Add an emit_string() as a shortcut; a bunch of minor cleanup.
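The idea, in miniature: every place that previously pushed a literal string one character at a time through Tokenizer_emit_text() can now make a single Tokenizer_emit_string() call. A standalone sketch of the shape (put_char() is a stand-in for the real per-character emitter; both follow the tokenizer's 0-on-success, -1-on-failure convention):

#include <stdio.h>

/* Stand-in for Tokenizer_emit_text(): writes one character, -1 on error. */
static int put_char(char c)
{
    return putchar(c) == EOF ? -1 : 0;
}

/* Shape of the new shortcut: walk the string, bail on the first failure. */
static int put_string(const char* text)
{
    int i = 0;

    while (text[i]) {
        if (put_char(text[i]))
            return -1;
        i++;
    }
    return 0;
}

int main(void)
{
    /* One call replaces loops like the old two-character "[[" fallback. */
    return put_string("[[\n") ? 1 : 0;
}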
--- mwparserfromhell/parser/tokenizer.c | 143 +++++++++++++++--------------------- 1 file changed, 60 insertions(+), 83 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 43df293..62e8599 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -29,6 +29,7 @@ SOFTWARE. static int heading_level_from_context(int n) { int level; + n /= LC_HEADING_LEVEL_1; for (level = 1; n > 1; n >>= 1) level++; @@ -72,6 +73,7 @@ static PyObject* strip_tag_name(PyObject* token) static Textbuffer* Textbuffer_new(void) { Textbuffer* buffer = malloc(sizeof(Textbuffer)); + if (!buffer) { PyErr_NoMemory(); return NULL; @@ -90,6 +92,7 @@ static Textbuffer* Textbuffer_new(void) static void Textbuffer_dealloc(Textbuffer* self) { Textbuffer* next; + while (self) { free(self->data); next = self->next; @@ -104,6 +107,7 @@ static void Textbuffer_dealloc(Textbuffer* self) static int Textbuffer_write(Textbuffer** this, Py_UNICODE text) { Textbuffer* self = *this; + if (self->size == TEXTBUFFER_BLOCKSIZE) { Textbuffer* new = Textbuffer_new(); if (!new) @@ -123,6 +127,7 @@ static PyObject* Textbuffer_render(Textbuffer* self) { PyObject *result = PyUnicode_FromUnicode(self->data, self->size); PyObject *left, *concat; + while (self->next) { self = self->next; left = PyUnicode_FromUnicode(self->data, self->size); @@ -208,6 +213,7 @@ static void Tokenizer_dealloc(Tokenizer* self) static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) { static char* kwlist[] = {NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist)) return -1; self->text = Py_None; @@ -223,6 +229,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) static int Tokenizer_push(Tokenizer* self, int context) { Stack* top = malloc(sizeof(Stack)); + if (!top) { PyErr_NoMemory(); return -1; @@ -246,6 +253,7 @@ static int Tokenizer_push_textbuffer(Tokenizer* self) { PyObject *text, *kwargs, *token; Textbuffer* buffer = self->topstack->textbuffer; + if (buffer->size == 0 && !buffer->next) return 0; text = Textbuffer_render(buffer); @@ -280,6 +288,7 @@ static int Tokenizer_push_textbuffer(Tokenizer* self) static void Tokenizer_delete_top_of_stack(Tokenizer* self) { Stack* top = self->topstack; + Py_DECREF(top->stack); Textbuffer_dealloc(top->textbuffer); self->topstack = top->next; @@ -293,6 +302,7 @@ static void Tokenizer_delete_top_of_stack(Tokenizer* self) static PyObject* Tokenizer_pop(Tokenizer* self) { PyObject* stack; + if (Tokenizer_push_textbuffer(self)) return NULL; stack = self->topstack->stack; @@ -309,6 +319,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) { PyObject* stack; int context; + if (Tokenizer_push_textbuffer(self)) return NULL; stack = self->topstack->stack; @@ -327,6 +338,7 @@ static void* Tokenizer_fail_route(Tokenizer* self) { int context = self->topstack->context; PyObject* stack = Tokenizer_pop(self); + Py_XDECREF(stack); FAIL_ROUTE(context); return NULL; @@ -365,6 +377,21 @@ static int Tokenizer_emit_text(Tokenizer* self, Py_UNICODE text) } /* + Write a string of text to the current textbuffer. +*/ +static int Tokenizer_emit_string(Tokenizer* self, const char* text) +{ + int i = 0; + + while (text[i]) { + if (Tokenizer_emit_text(self, text[i])) + return -1; + i++; + } + return 0; +} + +/* Write a series of tokens to the current stack at once. 
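    Like the other emitters, it returns 0 on success and -1 on failure.
    Callers typically pass a token list that was built on a temporary
    stack (for example, the parsed body that Tokenizer_emit_style_tag()
    splices back in).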
*/ static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist) @@ -428,15 +455,10 @@ static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist) static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text) { PyObject* stack = Tokenizer_pop(self); - int i = 0; - while (1) { - if (!text[i]) - break; - if (Tokenizer_emit_text(self, (Py_UNICODE) text[i])) { - Py_XDECREF(stack); - return -1; - } - i++; + + if (Tokenizer_emit_string(self, text)) { + Py_DECREF(stack); + return -1; } if (stack) { if (PyList_GET_SIZE(stack) > 0) { @@ -457,6 +479,7 @@ static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text) static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta) { Py_ssize_t index = self->head + delta; + if (index >= self->length) return EMPTY; return PyList_GET_ITEM(self->text, index); @@ -468,6 +491,7 @@ static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta) static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) { Py_ssize_t index; + if (delta > self->head) return EMPTY; index = self->head - delta; @@ -752,7 +776,6 @@ static int Tokenizer_parse_wikilink(Tokenizer* self) { Py_ssize_t reset; PyObject *wikilink, *token; - int i; self->head += 2; reset = self->head - 1; @@ -760,10 +783,8 @@ static int Tokenizer_parse_wikilink(Tokenizer* self) if (BAD_ROUTE) { RESET_ROUTE(); self->head = reset; - for (i = 0; i < 2; i++) { - if (Tokenizer_emit_text(self, *"[")) - return -1; - } + if (Tokenizer_emit_string(self, "[[")) + return -1; return 0; } if (!wikilink) @@ -1183,24 +1204,14 @@ static int Tokenizer_parse_comment(Tokenizer* self) { Py_ssize_t reset = self->head + 3; PyObject *token, *comment; - int i; self->head += 4; comment = Tokenizer_parse(self, LC_COMMENT, 1); if (BAD_ROUTE) { - const char* text = "