diff --git a/CHANGELOG b/CHANGELOG index bdcf906..64f91db 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,10 @@ +v0.5.1 (released March 03, 2018): + +- Improved behavior when adding parameters to templates (via Template.add()) + with poorly formatted whitespace conventions. (#185) +- Fixed the parser getting stuck in deeply nested HTML tags with unclosed, + quoted attributes. (#190) + v0.5 (released June 23, 2017): - Added Wikicode.contains() to determine whether a Node or Wikicode object is diff --git a/LICENSE b/LICENSE index 588e737..f353cd7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/appveyor.yml b/appveyor.yml index ff2ef4a..e99f54e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,6 +1,6 @@ # This config file is used by appveyor.com to build Windows release binaries -version: 0.5-b{build} +version: 0.5.1-b{build} branches: only: diff --git a/docs/changelog.rst b/docs/changelog.rst index cf4e31a..4e637d2 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,19 @@ Changelog ========= +v0.5.1 +------ + +`Released March 03, 2018 <https://github.com/earwig/mwparserfromhell/tree/v0.5.1>`_ +(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.5...v0.5.1>`__): + +- Improved behavior when adding parameters to templates (via + :meth:`.Template.add`) with poorly formatted whitespace conventions. + (`#185 <https://github.com/earwig/mwparserfromhell/issues/185>`_) +- Fixed the parser getting stuck in deeply nested HTML tags with unclosed, + quoted attributes. + (`#190 <https://github.com/earwig/mwparserfromhell/issues/190>`_) + v0.5 ---- diff --git a/docs/conf.py b/docs/conf.py index 3739429..5ac9c70 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -42,7 +42,7 @@ master_doc = 'index' # General information about the project. 
project = u'mwparserfromhell' -copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic' +copyright = u'2012–2018 Ben Kurtovic' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 17f9e97..ab8514a 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2017 Ben Kurtovic +# Copyright (C) 2012-2018 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -27,9 +27,9 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode. """ __author__ = "Ben Kurtovic" -__copyright__ = "Copyright (C) 2012, 2013, 2014, 2015, 2016 Ben Kurtovic" +__copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.5" +__version__ = "0.5.1" __email__ = "ben.kurtovic@gmail.com" from . 
import (compat, definitions, nodes, parser, smart_list, string_mixin, diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 58d25ae..9c058d4 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -101,7 +101,7 @@ class Template(Node): values = tuple(theories.values()) best = max(values) confidence = float(best) / sum(values) - if confidence >= 0.75: + if confidence > 0.5: return tuple(theories.keys())[values.index(best)] @staticmethod @@ -130,6 +130,8 @@ class Template(Node): before_theories = defaultdict(lambda: 0) after_theories = defaultdict(lambda: 0) for param in self.params: + if not param.showkey: + continue if use_names: component = str(param.name) else: diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index 90ee19d..1998368 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -722,7 +722,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) */ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) { - #define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS #define NOT_A_LINK \ if (!brackets && self->topstack->context & LC_DLTERM) \ return Tokenizer_handle_dl_term(self); \ @@ -732,7 +731,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) PyObject *link, *kwargs; Textbuffer *extra; - if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { + if (self->topstack->context & AGG_NO_EXT_LINKS || + !(Tokenizer_CAN_RECURSE(self))) { NOT_A_LINK; } extra = Textbuffer_new(&self->text); @@ -1280,6 +1280,7 @@ static int Tokenizer_handle_tag_data( 
else if (data->context & TAG_NOTE_SPACE) { if (data->context & TAG_QUOTED) { data->context = TAG_ATTR_VALUE; + Tokenizer_memoize_bad_route(self); trash = Tokenizer_pop(self); Py_XDECREF(trash); self->head = data->reset - 1; // Will be auto-incremented @@ -1317,7 +1318,12 @@ static int Tokenizer_handle_tag_data( data->context |= TAG_QUOTED; data->quoter = chunk; data->reset = self->head; - if (Tokenizer_push(self, self->topstack->context)) + if (Tokenizer_check_route(self, self->topstack->context) < 0) { + RESET_ROUTE(); + data->context = TAG_ATTR_VALUE; + self->head--; + } + else if (Tokenizer_push(self, self->topstack->context)) return -1; return 0; } @@ -1613,6 +1619,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) if (data->context & TAG_QUOTED) { // Unclosed attribute quote: reset, don't die data->context = TAG_ATTR_VALUE; + Tokenizer_memoize_bad_route(self); trash = Tokenizer_pop(self); Py_XDECREF(trash); self->head = data->reset; @@ -2185,6 +2192,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token if (data->context & TAG_QUOTED) { // Unclosed attribute quote: reset, don't die data->context = TAG_ATTR_VALUE; + Tokenizer_memoize_bad_route(self); trash = Tokenizer_pop(self); Py_XDECREF(trash); self->head = data->reset; diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.c b/mwparserfromhell/parser/ctokenizer/tok_support.c index 062c631..30dc2a1 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.c +++ b/mwparserfromhell/parser/ctokenizer/tok_support.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -147,6 +147,22 @@ static int compare_nodes( } /* + Remember that the current route (head + context at push) is invalid. 
+ + This will be noticed when calling Tokenizer_check_route with the same head + and context, and the route will be failed immediately. +*/ +void Tokenizer_memoize_bad_route(Tokenizer *self) +{ + route_tree_node *node = malloc(sizeof(route_tree_node)); + if (node) { + node->id = self->topstack->ident; + if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) + free(node); + } +} + +/* Fail the current tokenization route. Discards the current stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the ident of the failed stack so future parsing attempts down this route can be @@ -157,13 +173,7 @@ void* Tokenizer_fail_route(Tokenizer* self) uint64_t context = self->topstack->context; PyObject* stack; - route_tree_node *node = malloc(sizeof(route_tree_node)); - if (node) { - node->id = self->topstack->ident; - if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) - free(node); - } - + Tokenizer_memoize_bad_route(self); stack = Tokenizer_pop(self); Py_XDECREF(stack); FAIL_ROUTE(context); @@ -173,7 +183,7 @@ void* Tokenizer_fail_route(Tokenizer* self) /* Check if pushing a new route here with the given context would definitely fail, based on a previous call to Tokenizer_fail_route() with the same - stack. + stack. (Or any other call to Tokenizer_memoize_bad_route().) Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the latter case. 
diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.h b/mwparserfromhell/parser/ctokenizer/tok_support.h index 57f4126..f65d102 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.h +++ b/mwparserfromhell/parser/ctokenizer/tok_support.h @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -31,6 +31,7 @@ int Tokenizer_push_textbuffer(Tokenizer*); void Tokenizer_delete_top_of_stack(Tokenizer*); PyObject* Tokenizer_pop(Tokenizer*); PyObject* Tokenizer_pop_keeping_context(Tokenizer*); +void Tokenizer_memoize_bad_route(Tokenizer*); void* Tokenizer_fail_route(Tokenizer*); int Tokenizer_check_route(Tokenizer*, uint64_t); void Tokenizer_free_bad_route_tree(Tokenizer*); diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index d7a0282..1bfbc8d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2017 Ben Kurtovic +# Copyright (C) 2012-2018 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -144,6 +144,14 @@ class Tokenizer(object): """Return whether or not our max recursion depth has been exceeded.""" return self._depth < self.MAX_DEPTH + def _memoize_bad_route(self): + """Remember that the current route (head + context at push) is invalid. + + This will be noticed when calling _push with the same head and context, + and the route will be failed immediately. + """ + self._bad_routes.add(self._stack_ident) + def _fail_route(self): """Fail the current tokenization route. @@ -151,7 +159,7 @@ class Tokenizer(object): :exc:`.BadRoute`. 
""" context = self._context - self._bad_routes.add(self._stack_ident) + self._memoize_bad_route() self._pop() raise BadRoute(context) @@ -506,12 +514,16 @@ class Tokenizer(object): def _parse_external_link(self, brackets): """Parse an external link at the head of the wikicode string.""" + if self._context & contexts.NO_EXT_LINKS or not self._can_recurse(): + if not brackets and self._context & contexts.DL_TERM: + self._handle_dl_term() + else: + self._emit_text(self._read()) + return + reset = self._head self._head += 1 try: - bad_context = self._context & contexts.NO_EXT_LINKS - if bad_context or not self._can_recurse(): - raise BadRoute() link, extra, delta = self._really_parse_external_link(brackets) except BadRoute: self._head = reset @@ -719,6 +731,7 @@ class Tokenizer(object): elif data.context & data.CX_NOTE_SPACE: if data.context & data.CX_QUOTED: data.context = data.CX_ATTR_VALUE + self._memoize_bad_route() self._pop() self._head = data.reset - 1 # Will be auto-incremented return # Break early @@ -743,7 +756,13 @@ class Tokenizer(object): data.context |= data.CX_QUOTED data.quoter = chunk data.reset = self._head - self._push(self._context) + try: + self._push(self._context) + except BadRoute: + # Already failed to parse this as a quoted string + data.context = data.CX_ATTR_VALUE + self._head -= 1 + return continue elif data.context & data.CX_QUOTED: if chunk == data.quoter and not escaped: @@ -845,6 +864,7 @@ class Tokenizer(object): if data.context & data.CX_QUOTED: # Unclosed attribute quote: reset, don't die data.context = data.CX_ATTR_VALUE + self._memoize_bad_route() self._pop() self._head = data.reset continue @@ -1084,6 +1104,7 @@ class Tokenizer(object): if data.context & data.CX_QUOTED: # Unclosed attribute quote: reset, don't die data.context = data.CX_ATTR_VALUE + self._memoize_bad_route() self._pop() self._head = data.reset continue diff --git a/scripts/release.sh b/scripts/release.sh index 4f1e9b0..0d31e15 100755 --- a/scripts/release.sh +++ 
b/scripts/release.sh @@ -1,5 +1,7 @@ #! /usr/bin/env bash +set -euo pipefail + if [[ -z "$1" ]]; then echo "usage: $0 1.2.3" exit 1 diff --git a/setup.py b/setup.py index ee5fd50..0b33d42 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2018 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -107,6 +107,7 @@ setup( "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", "Topic :: Text Processing :: Markup" ], ) diff --git a/tests/test_template.py b/tests/test_template.py index 76a45cf..5b939f0 100644 --- a/tests/test_template.py +++ b/tests/test_template.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -21,6 +21,7 @@ # SOFTWARE. 
from __future__ import unicode_literals +from difflib import unified_diff try: import unittest2 as unittest @@ -30,6 +31,8 @@ except ImportError: from mwparserfromhell.compat import str from mwparserfromhell.nodes import HTMLEntity, Template, Text from mwparserfromhell.nodes.extras import Parameter +from mwparserfromhell import parse + from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext pgens = lambda k, v: Parameter(wraptext(k), wraptext(v), showkey=True) @@ -287,7 +290,7 @@ class TestTemplate(TreeEqualityTestCase): self.assertIsInstance(node12.params[1].value.get(1), HTMLEntity) self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|\nh = i}}", node13) self.assertEqual("{{a\n|b =c\n|d = e|f =g\n|h = i\n|j =k\n}}", node14) - self.assertEqual("{{a|b = c\n|\nd = e|\nf =g |h =i}}", node15) + self.assertEqual("{{a|b = c\n|\nd = e|\nf =g |\nh = i}}", node15) self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|h=i}}", node16) self.assertEqual("{{a|b|c}}", node17) self.assertEqual("{{a|b|3=c}}", node18) @@ -439,5 +442,172 @@ class TestTemplate(TreeEqualityTestCase): self.assertEqual("{{foo|a=b|c=d|e=f|a=|a=b}}", node26) self.assertRaises(ValueError, node27.remove, node28.get(1)) + def test_formatting(self): + """test realistic param manipulation with complex whitespace formatting + (assumes that parsing works correctly)""" + tests = [ + # https://en.wikipedia.org/w/index.php?title=Lamar_County,_Georgia&oldid=792356004 + ("""{{Infobox U.S. 
county +| county = Lamar County +| state = Georgia +| seal = +| founded = 1920 +| seat wl = Barnesville +| largest city wl = Barnesville +| area_total_sq_mi = 186 +| area_land_sq_mi = 184 +| area_water_sq_mi = 2.3 +| area percentage = 1.3% +| census yr = 2010 +| pop = 18317 +| density_sq_mi = 100 +| time zone = Eastern +| footnotes = +| web = www.lamarcountyga.com +| ex image = Lamar County Georgia Courthouse.jpg +| ex image cap = Lamar County courthouse in Barnesville +| district = 3rd +| named for = [[Lucius Quintus Cincinnatus Lamar II]] +}}""", + """@@ -11,4 +11,4 @@ + | area percentage = 1.3% +-| census yr = 2010 +-| pop = 18317 ++| census estimate yr = 2016 ++| pop = 12345example ref + | density_sq_mi = 100"""), + + # https://en.wikipedia.org/w/index.php?title=Rockdale_County,_Georgia&oldid=792359760 + ("""{{Infobox U.S. County| + county = Rockdale County | + state = Georgia | + seal = | + founded = October 18, 1870 | + seat wl = Conyers | + largest city wl = Conyers | + area_total_sq_mi = 132 | + area_land_sq_mi = 130 | + area_water_sq_mi = 2.3 | + area percentage = 1.7% | + census yr = 2010| + pop = 85215 | + density_sq_mi = 657 | + web = www.rockdalecounty.org +| ex image = Rockdale-county-courthouse.jpg +| ex image cap = Rockdale County Courthouse in Conyers +| district = 4th +| time zone= Eastern +}}""", + """@@ -11,4 +11,4 @@ + area percentage = 1.7% | +- census yr = 2010| +- pop = 85215 | ++ census estimate yr = 2016 | ++ pop = 12345example ref | + density_sq_mi = 657 |"""), + + # https://en.wikipedia.org/w/index.php?title=Spalding_County,_Georgia&oldid=792360413 + ("""{{Infobox U.S. 
County| +| county = Spalding County | +| state = Georgia | +| seal = | +| founded = 1851 | +| seat wl = Griffin | +| largest city wl = Griffin | +| area_total_sq_mi = 200 | +| area_land_sq_mi = 196 | +| area_water_sq_mi = 3.1 | +| area percentage = 1.6% | +| census yr = 2010| +| pop = 64073 | +| density_sq_mi = 326 | +| web = www.spaldingcounty.com | +| named for = [[Thomas Spalding]] +| ex image = Spalding County Courthouse (NE corner).JPG +| ex image cap = Spalding County Courthouse in Griffin +| district = 3rd +| time zone = Eastern +}}""", + """@@ -11,4 +11,4 @@ + | area percentage = 1.6% | +-| census yr = 2010| +-| pop = 64073 | ++| ++| census estimate yr = 2016 | pop = 12345example ref | + | density_sq_mi = 326 |"""), + + # https://en.wikipedia.org/w/index.php?title=Clinton_County,_Illinois&oldid=794694648 + ("""{{Infobox U.S. county + |county = Clinton County + |state = Illinois +| ex image = File:Clinton County Courthouse, Carlyle.jpg +| ex image cap = [[Clinton County Courthouse (Illinois)|Clinton County Courthouse]] + |seal = + |founded = 1824 + |named for = [[DeWitt Clinton]] + |seat wl= Carlyle +| largest city wl = Breese + |time zone=Central + |area_total_sq_mi = 503 + |area_land_sq_mi = 474 + |area_water_sq_mi = 29 + |area percentage = 5.8% + |census yr = 2010 + |pop = 37762 + |density_sq_mi = 80 + |web = www.clintonco.illinois.gov +| district = 15th +}}""", + """@@ -15,4 +15,4 @@ + |area percentage = 5.8% +- |census yr = 2010 +- |pop = 37762 ++ |census estimate yr = 2016 ++ |pop = 12345example ref + |density_sq_mi = 80"""), + + # https://en.wikipedia.org/w/index.php?title=Winnebago_County,_Illinois&oldid=789193800 + ("""{{Infobox U.S. 
county | + county = Winnebago County | + state = Illinois | + seal = Winnebago County il seal.png | + named for = [[Winnebago (tribe)|Winnebago Tribe]] | + seat wl= Rockford | + largest city wl = Rockford| + area_total_sq_mi = 519 | + area_land_sq_mi = 513| + area_water_sq_mi = 5.9 | + area percentage = 1.1% | + census yr = 2010| + pop = 295266 | + density_sq_mi = 575 +| web = www.wincoil.us +| founded year = 1836 +| founded date = January 16 +| time zone = Central +| district = 16th +| district2 = 17th +}}""", + """@@ -11,4 +11,4 @@ + area percentage = 1.1% | +- census yr = 2010| +- pop = 295266 | ++ census estimate yr = 2016| ++ pop = 12345example ref | + density_sq_mi = 575""")] + + for (original, expected) in tests: + code = parse(original) + template = code.filter_templates()[0] + template.add("pop", "12345example ref") + template.add('census estimate yr', "2016", before="pop") + template.remove("census yr") + + oldlines = original.splitlines(True) + newlines = str(code).splitlines(True) + difflines = unified_diff(oldlines, newlines, n=1) + diff = "".join(list(difflines)[2:]).strip() + self.assertEqual(expected, diff) + if __name__ == "__main__": unittest.main(verbosity=2)