From cdc97f6f296e1f28bd5da24ee1c34fc7299d5a88 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 04:40:26 -0400 Subject: [PATCH 1/9] Version bump for 0.6 [ci skip] --- CHANGELOG | 4 ++++ appveyor.yml | 2 +- docs/changelog.rst | 8 ++++++++ mwparserfromhell/__init__.py | 2 +- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index bdcf906..f1377a8 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,7 @@ +v0.6 (unreleased): + +- ... + v0.5 (released June 23, 2017): - Added Wikicode.contains() to determine whether a Node or Wikicode object is diff --git a/appveyor.yml b/appveyor.yml index ff2ef4a..70b71b4 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,6 +1,6 @@ # This config file is used by appveyor.com to build Windows release binaries -version: 0.5-b{build} +version: 0.6.dev0-b{build} branches: only: diff --git a/docs/changelog.rst b/docs/changelog.rst index cf4e31a..b0c49ab 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,14 @@ Changelog ========= +v0.6 +---- + +Unreleased +(`changes `__): + +- ... + v0.5 ---- diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 17f9e97..20370ac 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -29,7 +29,7 @@ outrageously powerful parser for `MediaWiki `_ wikicode. __author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2012, 2013, 2014, 2015, 2016 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.5" +__version__ = "0.6.dev0" __email__ = "ben.kurtovic@gmail.com" from . import (compat, definitions, nodes, parser, smart_list, string_mixin, From 46fbd85c2e2effb697caab211d5432c80c823076 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 04:41:32 -0400 Subject: [PATCH 2/9] Typo [ci skip] --- docs/changelog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index b0c49ab..192eeca 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -5,7 +5,7 @@ v0.6 ---- Unreleased -(`changes `__): +(`changes `__): - ... From 7a30e47f767c58bf6ff20fdb9e2e4a5f12d2ac8f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 15 Aug 2017 18:11:34 -0400 Subject: [PATCH 3/9] Some improvements to whitespace recognition; unit tests (#185). --- CHANGELOG | 3 +- docs/changelog.rst | 3 +- mwparserfromhell/__init__.py | 2 +- mwparserfromhell/nodes/template.py | 4 +- tests/test_template.py | 229 ++++++++++++++++++++++++++++++++++++- 5 files changed, 235 insertions(+), 6 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index f1377a8..d3a2b2b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ v0.6 (unreleased): -- ... +- Improved behavior when adding parameters to templates (via Template.add()) + with poorly formatted whitespace conventions. v0.5 (released June 23, 2017): diff --git a/docs/changelog.rst b/docs/changelog.rst index 192eeca..841f04a 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -7,7 +7,8 @@ v0.6 Unreleased (`changes `__): -- ... +- Improved behavior when adding parameters to templates (via + :meth:`.Template.add`) with poorly formatted whitespace conventions. v0.5 ---- diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 20370ac..1a9c542 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -27,7 +27,7 @@ outrageously powerful parser for `MediaWiki `_ wikicode. """ __author__ = "Ben Kurtovic" -__copyright__ = "Copyright (C) 2012, 2013, 2014, 2015, 2016 Ben Kurtovic" +__copyright__ = "Copyright (C) 2012-2017 Ben Kurtovic" __license__ = "MIT License" __version__ = "0.6.dev0" __email__ = "ben.kurtovic@gmail.com" diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 58d25ae..9c058d4 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -101,7 +101,7 @@ class Template(Node): values = tuple(theories.values()) best = max(values) confidence = float(best) / sum(values) - if confidence >= 0.75: + if confidence > 0.5: return tuple(theories.keys())[values.index(best)] @staticmethod @@ -130,6 +130,8 @@ class Template(Node): before_theories = defaultdict(lambda: 0) after_theories = defaultdict(lambda: 0) for param in self.params: + if not param.showkey: + continue if use_names: component = str(param.name) else: diff --git a/tests/test_template.py b/tests/test_template.py index 76a45cf..5d55b98 100644 --- a/tests/test_template.py +++ b/tests/test_template.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -30,6 +30,8 @@ except ImportError: from mwparserfromhell.compat import str from mwparserfromhell.nodes import HTMLEntity, Template, Text from mwparserfromhell.nodes.extras import Parameter +from mwparserfromhell import parse + from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext pgens = lambda k, v: Parameter(wraptext(k), wraptext(v), showkey=True) @@ -287,7 +289,7 @@ class TestTemplate(TreeEqualityTestCase): self.assertIsInstance(node12.params[1].value.get(1), HTMLEntity) self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|\nh = i}}", node13) self.assertEqual("{{a\n|b =c\n|d = e|f =g\n|h = i\n|j =k\n}}", node14) - self.assertEqual("{{a|b = c\n|\nd = e|\nf =g |h =i}}", node15) + self.assertEqual("{{a|b = c\n|\nd = e|\nf =g |\nh = i}}", node15) self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|h=i}}", node16) self.assertEqual("{{a|b|c}}", node17) self.assertEqual("{{a|b|3=c}}", node18) @@ -439,5 +441,228 @@ class TestTemplate(TreeEqualityTestCase): self.assertEqual("{{foo|a=b|c=d|e=f|a=|a=b}}", node26) self.assertRaises(ValueError, node27.remove, node28.get(1)) + def test_formatting(self): + """test realistic param manipulation with complex whitespace formatting + (assumes that parsing works correctly)""" + tests = [ + # https://en.wikipedia.org/w/index.php?title=Lamar_County,_Georgia&oldid=792356004 + ("""{{Infobox U.S. county +| county = Lamar County +| state = Georgia +| seal = +| founded = 1920 +| seat wl = Barnesville +| largest city wl = Barnesville +| area_total_sq_mi = 186 +| area_land_sq_mi = 184 +| area_water_sq_mi = 2.3 +| area percentage = 1.3% +| census yr = 2010 +| pop = 18317 +| density_sq_mi = 100 +| time zone = Eastern +| footnotes = +| web = www.lamarcountyga.com +| ex image = Lamar County Georgia Courthouse.jpg +| ex image cap = Lamar County courthouse in Barnesville +| district = 3rd +| named for = [[Lucius Quintus Cincinnatus Lamar II]] +}}""", """{{Infobox U.S. county +| county = Lamar County +| state = Georgia +| seal = +| founded = 1920 +| seat wl = Barnesville +| largest city wl = Barnesville +| area_total_sq_mi = 186 +| area_land_sq_mi = 184 +| area_water_sq_mi = 2.3 +| area percentage = 1.3% +| census estimate yr = 2016 +| pop = 12345example ref +| density_sq_mi = 100 +| time zone = Eastern +| footnotes = +| web = www.lamarcountyga.com +| ex image = Lamar County Georgia Courthouse.jpg +| ex image cap = Lamar County courthouse in Barnesville +| district = 3rd +| named for = [[Lucius Quintus Cincinnatus Lamar II]] +}}"""), + # https://en.wikipedia.org/w/index.php?title=Rockdale_County,_Georgia&oldid=792359760 + ("""{{Infobox U.S. County| + county = Rockdale County | + state = Georgia | + seal = | + founded = October 18, 1870 | + seat wl = Conyers | + largest city wl = Conyers | + area_total_sq_mi = 132 | + area_land_sq_mi = 130 | + area_water_sq_mi = 2.3 | + area percentage = 1.7% | + census yr = 2010| + pop = 85215 | + density_sq_mi = 657 | + web = www.rockdalecounty.org +| ex image = Rockdale-county-courthouse.jpg +| ex image cap = Rockdale County Courthouse in Conyers +| district = 4th +| time zone= Eastern +}}""", """{{Infobox U.S. County| + county = Rockdale County | + state = Georgia | + seal = | + founded = October 18, 1870 | + seat wl = Conyers | + largest city wl = Conyers | + area_total_sq_mi = 132 | + area_land_sq_mi = 130 | + area_water_sq_mi = 2.3 | + area percentage = 1.7% | + census estimate yr = 2016 | + pop = 12345example ref | + density_sq_mi = 657 | + web = www.rockdalecounty.org +| ex image = Rockdale-county-courthouse.jpg +| ex image cap = Rockdale County Courthouse in Conyers +| district = 4th +| time zone= Eastern +}}"""), + # https://en.wikipedia.org/w/index.php?title=Spalding_County,_Georgia&oldid=792360413 + ("""{{Infobox U.S. County| +| county = Spalding County | +| state = Georgia | +| seal = | +| founded = 1851 | +| seat wl = Griffin | +| largest city wl = Griffin | +| area_total_sq_mi = 200 | +| area_land_sq_mi = 196 | +| area_water_sq_mi = 3.1 | +| area percentage = 1.6% | +| census yr = 2010| +| pop = 64073 | +| density_sq_mi = 326 | +| web = www.spaldingcounty.com | +| named for = [[Thomas Spalding]] +| ex image = Spalding County Courthouse (NE corner).JPG +| ex image cap = Spalding County Courthouse in Griffin +| district = 3rd +| time zone = Eastern +}}""", """{{Infobox U.S. County| +| county = Spalding County | +| state = Georgia | +| seal = | +| founded = 1851 | +| seat wl = Griffin | +| largest city wl = Griffin | +| area_total_sq_mi = 200 | +| area_land_sq_mi = 196 | +| area_water_sq_mi = 3.1 | +| area percentage = 1.6% | +| +| census estimate yr = 2016 | pop = 12345example ref | +| density_sq_mi = 326 | +| web = www.spaldingcounty.com | +| named for = [[Thomas Spalding]] +| ex image = Spalding County Courthouse (NE corner).JPG +| ex image cap = Spalding County Courthouse in Griffin +| district = 3rd +| time zone = Eastern +}}"""), + # https://en.wikipedia.org/w/index.php?title=Clinton_County,_Illinois&oldid=794694648 + ("""{{Infobox U.S. county + |county = Clinton County + |state = Illinois +| ex image = File:Clinton County Courthouse, Carlyle.jpg +| ex image cap = [[Clinton County Courthouse (Illinois)|Clinton County Courthouse]] + |seal = + |founded = 1824 + |named for = [[DeWitt Clinton]] + |seat wl= Carlyle +| largest city wl = Breese + |time zone=Central + |area_total_sq_mi = 503 + |area_land_sq_mi = 474 + |area_water_sq_mi = 29 + |area percentage = 5.8% + |census yr = 2010 + |pop = 37762 + |density_sq_mi = 80 + |web = www.clintonco.illinois.gov +| district = 15th +}}""", """{{Infobox U.S. county + |county = Clinton County + |state = Illinois +| ex image = File:Clinton County Courthouse, Carlyle.jpg +| ex image cap = [[Clinton County Courthouse (Illinois)|Clinton County Courthouse]] + |seal = + |founded = 1824 + |named for = [[DeWitt Clinton]] + |seat wl= Carlyle +| largest city wl = Breese + |time zone=Central + |area_total_sq_mi = 503 + |area_land_sq_mi = 474 + |area_water_sq_mi = 29 + |area percentage = 5.8% + |census estimate yr = 2016 + |pop = 12345example ref + |density_sq_mi = 80 + |web = www.clintonco.illinois.gov +| district = 15th +}}"""), + # https://en.wikipedia.org/w/index.php?title=Winnebago_County,_Illinois&oldid=789193800 + ("""{{Infobox U.S. county | + county = Winnebago County | + state = Illinois | + seal = Winnebago County il seal.png | + named for = [[Winnebago (tribe)|Winnebago Tribe]] | + seat wl= Rockford | + largest city wl = Rockford| + area_total_sq_mi = 519 | + area_land_sq_mi = 513| + area_water_sq_mi = 5.9 | + area percentage = 1.1% | + census yr = 2010| + pop = 295266 | + density_sq_mi = 575 +| web = www.wincoil.us +| founded year = 1836 +| founded date = January 16 +| time zone = Central +| district = 16th +| district2 = 17th +}}""", """{{Infobox U.S. county | + county = Winnebago County | + state = Illinois | + seal = Winnebago County il seal.png | + named for = [[Winnebago (tribe)|Winnebago Tribe]] | + seat wl= Rockford | + largest city wl = Rockford| + area_total_sq_mi = 519 | + area_land_sq_mi = 513| + area_water_sq_mi = 5.9 | + area percentage = 1.1% | + census estimate yr = 2016| + pop = 12345example ref | + density_sq_mi = 575 +| web = www.wincoil.us +| founded year = 1836 +| founded date = January 16 +| time zone = Central +| district = 16th +| district2 = 17th +}}""")] + + for (original, expected) in tests: + code = parse(original) + template = code.filter_templates()[0] + template.add("pop", "12345example ref") + template.add('census estimate yr', "2016", before="pop") + template.remove("census yr") + self.assertEqual(expected, str(code)) + if __name__ == "__main__": unittest.main(verbosity=2) From 253102be35b974c968bbecaa854aae3d7e27c67e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 22 Aug 2017 23:46:53 -0400 Subject: [PATCH 4/9] Minor change to template test_formatting format. --- tests/test_template.py | 157 ++++++++++++++++--------------------------------- 1 file changed, 51 insertions(+), 106 deletions(-) diff --git a/tests/test_template.py b/tests/test_template.py index 5d55b98..a53d5d2 100644 --- a/tests/test_template.py +++ b/tests/test_template.py @@ -21,6 +21,7 @@ # SOFTWARE. from __future__ import unicode_literals +from difflib import unified_diff try: import unittest2 as unittest @@ -467,28 +468,15 @@ class TestTemplate(TreeEqualityTestCase): | ex image cap = Lamar County courthouse in Barnesville | district = 3rd | named for = [[Lucius Quintus Cincinnatus Lamar II]] -}}""", """{{Infobox U.S. county -| county = Lamar County -| state = Georgia -| seal = -| founded = 1920 -| seat wl = Barnesville -| largest city wl = Barnesville -| area_total_sq_mi = 186 -| area_land_sq_mi = 184 -| area_water_sq_mi = 2.3 -| area percentage = 1.3% -| census estimate yr = 2016 -| pop = 12345example ref -| density_sq_mi = 100 -| time zone = Eastern -| footnotes = -| web = www.lamarcountyga.com -| ex image = Lamar County Georgia Courthouse.jpg -| ex image cap = Lamar County courthouse in Barnesville -| district = 3rd -| named for = [[Lucius Quintus Cincinnatus Lamar II]] -}}"""), +}}""", + """@@ -11,4 +11,4 @@ + | area percentage = 1.3% +-| census yr = 2010 +-| pop = 18317 ++| census estimate yr = 2016 ++| pop = 12345example ref + | density_sq_mi = 100"""), + # https://en.wikipedia.org/w/index.php?title=Rockdale_County,_Georgia&oldid=792359760 ("""{{Infobox U.S. County| county = Rockdale County | @@ -509,26 +497,15 @@ class TestTemplate(TreeEqualityTestCase): | ex image cap = Rockdale County Courthouse in Conyers | district = 4th | time zone= Eastern -}}""", """{{Infobox U.S. County| - county = Rockdale County | - state = Georgia | - seal = | - founded = October 18, 1870 | - seat wl = Conyers | - largest city wl = Conyers | - area_total_sq_mi = 132 | - area_land_sq_mi = 130 | - area_water_sq_mi = 2.3 | - area percentage = 1.7% | - census estimate yr = 2016 | - pop = 12345example ref | - density_sq_mi = 657 | - web = www.rockdalecounty.org -| ex image = Rockdale-county-courthouse.jpg -| ex image cap = Rockdale County Courthouse in Conyers -| district = 4th -| time zone= Eastern -}}"""), +}}""", + """@@ -11,4 +11,4 @@ + area percentage = 1.7% | +- census yr = 2010| +- pop = 85215 | ++ census estimate yr = 2016 | ++ pop = 12345example ref | + density_sq_mi = 657 |"""), + # https://en.wikipedia.org/w/index.php?title=Spalding_County,_Georgia&oldid=792360413 ("""{{Infobox U.S. County| | county = Spalding County | @@ -550,27 +527,15 @@ class TestTemplate(TreeEqualityTestCase): | ex image cap = Spalding County Courthouse in Griffin | district = 3rd | time zone = Eastern -}}""", """{{Infobox U.S. County| -| county = Spalding County | -| state = Georgia | -| seal = | -| founded = 1851 | -| seat wl = Griffin | -| largest city wl = Griffin | -| area_total_sq_mi = 200 | -| area_land_sq_mi = 196 | -| area_water_sq_mi = 3.1 | -| area percentage = 1.6% | -| -| census estimate yr = 2016 | pop = 12345example ref | -| density_sq_mi = 326 | -| web = www.spaldingcounty.com | -| named for = [[Thomas Spalding]] -| ex image = Spalding County Courthouse (NE corner).JPG -| ex image cap = Spalding County Courthouse in Griffin -| district = 3rd -| time zone = Eastern -}}"""), +}}""", + """@@ -11,4 +11,4 @@ + | area percentage = 1.6% | +-| census yr = 2010| +-| pop = 64073 | ++| ++| census estimate yr = 2016 | pop = 12345example ref | + | density_sq_mi = 326 |"""), + # https://en.wikipedia.org/w/index.php?title=Clinton_County,_Illinois&oldid=794694648 ("""{{Infobox U.S. county |county = Clinton County @@ -592,27 +557,15 @@ class TestTemplate(TreeEqualityTestCase): |density_sq_mi = 80 |web = www.clintonco.illinois.gov | district = 15th -}}""", """{{Infobox U.S. county - |county = Clinton County - |state = Illinois -| ex image = File:Clinton County Courthouse, Carlyle.jpg -| ex image cap = [[Clinton County Courthouse (Illinois)|Clinton County Courthouse]] - |seal = - |founded = 1824 - |named for = [[DeWitt Clinton]] - |seat wl= Carlyle -| largest city wl = Breese - |time zone=Central - |area_total_sq_mi = 503 - |area_land_sq_mi = 474 - |area_water_sq_mi = 29 - |area percentage = 5.8% - |census estimate yr = 2016 - |pop = 12345example ref - |density_sq_mi = 80 - |web = www.clintonco.illinois.gov -| district = 15th -}}"""), +}}""", + """@@ -15,4 +15,4 @@ + |area percentage = 5.8% +- |census yr = 2010 +- |pop = 37762 ++ |census estimate yr = 2016 ++ |pop = 12345example ref + |density_sq_mi = 80"""), + # https://en.wikipedia.org/w/index.php?title=Winnebago_County,_Illinois&oldid=789193800 ("""{{Infobox U.S. county | county = Winnebago County | @@ -634,27 +587,14 @@ class TestTemplate(TreeEqualityTestCase): | time zone = Central | district = 16th | district2 = 17th -}}""", """{{Infobox U.S. county | - county = Winnebago County | - state = Illinois | - seal = Winnebago County il seal.png | - named for = [[Winnebago (tribe)|Winnebago Tribe]] | - seat wl= Rockford | - largest city wl = Rockford| - area_total_sq_mi = 519 | - area_land_sq_mi = 513| - area_water_sq_mi = 5.9 | - area percentage = 1.1% | - census estimate yr = 2016| - pop = 12345example ref | - density_sq_mi = 575 -| web = www.wincoil.us -| founded year = 1836 -| founded date = January 16 -| time zone = Central -| district = 16th -| district2 = 17th -}}""")] +}}""", + """@@ -11,4 +11,4 @@ + area percentage = 1.1% | +- census yr = 2010| +- pop = 295266 | ++ census estimate yr = 2016| ++ pop = 12345example ref | + density_sq_mi = 575""")] for (original, expected) in tests: code = parse(original) @@ -662,7 +602,12 @@ class TestTemplate(TreeEqualityTestCase): template.add("pop", "12345example ref") template.add('census estimate yr', "2016", before="pop") template.remove("census yr") - self.assertEqual(expected, str(code)) + + oldlines = original.splitlines(keepends=True) + newlines = str(code).splitlines(keepends=True) + difflines = unified_diff(oldlines, newlines, n=1) + diff = "".join(list(difflines)[2:]).strip() + self.assertEqual(expected, diff) if __name__ == "__main__": unittest.main(verbosity=2) From 46000ee7c8b331dfa0eb9e454b26414571f76954 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 23 Aug 2017 02:15:37 -0400 Subject: [PATCH 5/9] Fix test on old Python versions --- tests/test_template.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_template.py b/tests/test_template.py index a53d5d2..5b939f0 100644 --- a/tests/test_template.py +++ b/tests/test_template.py @@ -603,8 +603,8 @@ class TestTemplate(TreeEqualityTestCase): template.add('census estimate yr', "2016", before="pop") template.remove("census yr") - oldlines = original.splitlines(keepends=True) - newlines = str(code).splitlines(keepends=True) + oldlines = original.splitlines(True) + newlines = str(code).splitlines(True) difflines = unified_diff(oldlines, newlines, n=1) diff = "".join(list(difflines)[2:]).strip() self.assertEqual(expected, diff) From 86c805d59b835146e792504550da860f95b11c9a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 25 Feb 2018 02:33:15 -0500 Subject: [PATCH 6/9] Don't get stuck in tags with unclosed quoted attributes (fixes #190). --- CHANGELOG | 4 ++- LICENSE | 2 +- docs/changelog.rst | 4 +++ docs/conf.py | 2 +- mwparserfromhell/__init__.py | 4 +-- mwparserfromhell/parser/ctokenizer/tok_parse.c | 16 +++++++++--- mwparserfromhell/parser/ctokenizer/tok_support.c | 28 +++++++++++++------- mwparserfromhell/parser/ctokenizer/tok_support.h | 3 ++- mwparserfromhell/parser/tokenizer.py | 33 +++++++++++++++++++----- 9 files changed, 71 insertions(+), 25 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index d3a2b2b..ebe4d7d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,9 @@ v0.6 (unreleased): - Improved behavior when adding parameters to templates (via Template.add()) - with poorly formatted whitespace conventions. + with poorly formatted whitespace conventions. (#185) +- Fixed the parser getting stuck in deeply nested HTML tags with unclosed, + quoted attributes. (#190) v0.5 (released June 23, 2017): diff --git a/LICENSE b/LICENSE index 588e737..f353cd7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/docs/changelog.rst b/docs/changelog.rst index 841f04a..7aa8f22 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -9,6 +9,10 @@ Unreleased - Improved behavior when adding parameters to templates (via :meth:`.Template.add`) with poorly formatted whitespace conventions. + (`#185 `_) +- Fixed the parser getting stuck in deeply nested HTML tags with unclosed, + quoted attributes. + (`#190 `_) v0.5 ---- diff --git a/docs/conf.py b/docs/conf.py index 3739429..5ac9c70 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -42,7 +42,7 @@ master_doc = 'index' # General information about the project. project = u'mwparserfromhell' -copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic' +copyright = u'2012–2018 Ben Kurtovic' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 1a9c542..11e1094 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2017 Ben Kurtovic +# Copyright (C) 2012-2018 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ outrageously powerful parser for `MediaWiki `_ wikicode. """ __author__ = "Ben Kurtovic" -__copyright__ = "Copyright (C) 2012-2017 Ben Kurtovic" +__copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic" __license__ = "MIT License" __version__ = "0.6.dev0" __email__ = "ben.kurtovic@gmail.com" diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index 90ee19d..1998368 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -722,7 +722,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) */ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) { - #define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS #define NOT_A_LINK \ if (!brackets && self->topstack->context & LC_DLTERM) \ return Tokenizer_handle_dl_term(self); \ @@ -732,7 +731,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) PyObject *link, *kwargs; Textbuffer *extra; - if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { + if (self->topstack->context & AGG_NO_EXT_LINKS || + !(Tokenizer_CAN_RECURSE(self))) { NOT_A_LINK; } extra = Textbuffer_new(&self->text); @@ -1280,6 +1280,7 @@ static int Tokenizer_handle_tag_data( else if (data->context & TAG_NOTE_SPACE) { if (data->context & TAG_QUOTED) { data->context = TAG_ATTR_VALUE; + Tokenizer_memoize_bad_route(self); trash = Tokenizer_pop(self); Py_XDECREF(trash); self->head = data->reset - 1; // Will be auto-incremented @@ -1317,7 +1318,12 @@ static int Tokenizer_handle_tag_data( data->context |= TAG_QUOTED; data->quoter = chunk; data->reset = self->head; - if (Tokenizer_push(self, self->topstack->context)) + if (Tokenizer_check_route(self, self->topstack->context) < 0) { + RESET_ROUTE(); + data->context = TAG_ATTR_VALUE; + self->head--; + } + else if (Tokenizer_push(self, self->topstack->context)) return -1; return 0; } @@ -1613,6 +1619,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) if (data->context & TAG_QUOTED) { // Unclosed attribute quote: reset, don't die data->context = TAG_ATTR_VALUE; + Tokenizer_memoize_bad_route(self); trash = Tokenizer_pop(self); Py_XDECREF(trash); self->head = data->reset; @@ -2185,6 +2192,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token if (data->context & TAG_QUOTED) { // Unclosed attribute quote: reset, don't die data->context = TAG_ATTR_VALUE; + Tokenizer_memoize_bad_route(self); trash = Tokenizer_pop(self); Py_XDECREF(trash); self->head = data->reset; diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.c b/mwparserfromhell/parser/ctokenizer/tok_support.c index 062c631..30dc2a1 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.c +++ b/mwparserfromhell/parser/ctokenizer/tok_support.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -147,6 +147,22 @@ static int compare_nodes( } /* + Remember that the current route (head + context at push) is invalid. + + This will be noticed when calling Tokenizer_check_route with the same head + and context, and the route will be failed immediately. +*/ +void Tokenizer_memoize_bad_route(Tokenizer *self) +{ + route_tree_node *node = malloc(sizeof(route_tree_node)); + if (node) { + node->id = self->topstack->ident; + if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) + free(node); + } +} + +/* Fail the current tokenization route. Discards the current stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the ident of the failed stack so future parsing attempts down this route can be @@ -157,13 +173,7 @@ void* Tokenizer_fail_route(Tokenizer* self) uint64_t context = self->topstack->context; PyObject* stack; - route_tree_node *node = malloc(sizeof(route_tree_node)); - if (node) { - node->id = self->topstack->ident; - if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) - free(node); - } - + Tokenizer_memoize_bad_route(self); stack = Tokenizer_pop(self); Py_XDECREF(stack); FAIL_ROUTE(context); @@ -173,7 +183,7 @@ void* Tokenizer_fail_route(Tokenizer* self) /* Check if pushing a new route here with the given context would definitely fail, based on a previous call to Tokenizer_fail_route() with the same - stack. + stack. (Or any other call to Tokenizer_memoize_bad_route().) Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the latter case. diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.h b/mwparserfromhell/parser/ctokenizer/tok_support.h index 57f4126..f65d102 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.h +++ b/mwparserfromhell/parser/ctokenizer/tok_support.h @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -31,6 +31,7 @@ int Tokenizer_push_textbuffer(Tokenizer*); void Tokenizer_delete_top_of_stack(Tokenizer*); PyObject* Tokenizer_pop(Tokenizer*); PyObject* Tokenizer_pop_keeping_context(Tokenizer*); +void Tokenizer_memoize_bad_route(Tokenizer*); void* Tokenizer_fail_route(Tokenizer*); int Tokenizer_check_route(Tokenizer*, uint64_t); void Tokenizer_free_bad_route_tree(Tokenizer*); diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index d7a0282..1bfbc8d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2017 Ben Kurtovic +# Copyright (C) 2012-2018 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -144,6 +144,14 @@ class Tokenizer(object): """Return whether or not our max recursion depth has been exceeded.""" return self._depth < self.MAX_DEPTH + def _memoize_bad_route(self): + """Remember that the current route (head + context at push) is invalid. + + This will be noticed when calling _push with the same head and context, + and the route will be failed immediately. + """ + self._bad_routes.add(self._stack_ident) + def _fail_route(self): """Fail the current tokenization route. @@ -151,7 +159,7 @@ class Tokenizer(object): :exc:`.BadRoute`. """ context = self._context - self._bad_routes.add(self._stack_ident) + self._memoize_bad_route() self._pop() raise BadRoute(context) @@ -506,12 +514,16 @@ class Tokenizer(object): def _parse_external_link(self, brackets): """Parse an external link at the head of the wikicode string.""" + if self._context & contexts.NO_EXT_LINKS or not self._can_recurse(): + if not brackets and self._context & contexts.DL_TERM: + self._handle_dl_term() + else: + self._emit_text(self._read()) + return + reset = self._head self._head += 1 try: - bad_context = self._context & contexts.NO_EXT_LINKS - if bad_context or not self._can_recurse(): - raise BadRoute() link, extra, delta = self._really_parse_external_link(brackets) except BadRoute: self._head = reset @@ -719,6 +731,7 @@ class Tokenizer(object): elif data.context & data.CX_NOTE_SPACE: if data.context & data.CX_QUOTED: data.context = data.CX_ATTR_VALUE + self._memoize_bad_route() self._pop() self._head = data.reset - 1 # Will be auto-incremented return # Break early @@ -743,7 +756,13 @@ class Tokenizer(object): data.context |= data.CX_QUOTED data.quoter = chunk data.reset = self._head - self._push(self._context) + try: + self._push(self._context) + except BadRoute: + # Already failed to parse this as a quoted string + data.context = data.CX_ATTR_VALUE + self._head -= 1 + return continue elif data.context & data.CX_QUOTED: if chunk == data.quoter and not escaped: @@ -845,6 +864,7 @@ class Tokenizer(object): if data.context & data.CX_QUOTED: # Unclosed attribute quote: reset, don't die data.context = data.CX_ATTR_VALUE + self._memoize_bad_route() self._pop() self._head = data.reset continue @@ -1084,6 +1104,7 @@ class Tokenizer(object): if data.context & data.CX_QUOTED: # Unclosed attribute quote: reset, don't die data.context = data.CX_ATTR_VALUE + self._memoize_bad_route() self._pop() self._head = data.reset continue From 1a4e2fc01976a9f0a0b56085a0d8525debdf338c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 3 Mar 2018 12:47:45 -0500 Subject: [PATCH 7/9] Add Python 3.7 to trove classifiers. --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ee5fd50..0b33d42 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2018 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -107,6 +107,7 @@ setup( "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", "Topic :: Text Processing :: Markup" ], ) From d0da416e3e1a64f06ea149031c80127b245ed0a1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 3 Mar 2018 12:54:38 -0500 Subject: [PATCH 8/9] Make release script safer. --- scripts/release.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/release.sh b/scripts/release.sh index 4f1e9b0..0d31e15 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -1,5 +1,7 @@ #! /usr/bin/env bash +set -euo pipefail + if [[ -z "$1" ]]; then echo "usage: $0 1.2.3" exit 1 From cb96b4378a6fb0d9a6f2ed4575c83f3a19c8d072 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 3 Mar 2018 13:41:26 -0500 Subject: [PATCH 9/9] release/0.5.1 --- CHANGELOG | 2 +- appveyor.yml | 2 +- docs/changelog.rst | 8 ++++---- mwparserfromhell/__init__.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index ebe4d7d..64f91db 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ -v0.6 (unreleased): +v0.5.1 (released March 03, 2018): - Improved behavior when adding parameters to templates (via Template.add()) with poorly formatted whitespace conventions. (#185) diff --git a/appveyor.yml b/appveyor.yml index 70b71b4..e99f54e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,6 +1,6 @@ # This config file is used by appveyor.com to build Windows release binaries -version: 0.6.dev0-b{build} +version: 0.5.1-b{build} branches: only: diff --git a/docs/changelog.rst b/docs/changelog.rst index 7aa8f22..4e637d2 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,11 +1,11 @@ Changelog ========= -v0.6 ----- +v0.5.1 +------ -Unreleased -(`changes `__): +`Released March 03, 2018 `_ +(`changes `__): - Improved behavior when adding parameters to templates (via :meth:`.Template.add`) with poorly formatted whitespace conventions. diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 11e1094..ab8514a 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -29,7 +29,7 @@ outrageously powerful parser for `MediaWiki `_ wikicode. __author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.6.dev0" +__version__ = "0.5.1" __email__ = "ben.kurtovic@gmail.com" from . import (compat, definitions, nodes, parser, smart_list, string_mixin,