Merge develop into master (release/0.5.1)

6 years ago · c8df09469e
--- a/+ 7
+++ b/+ 7
@@ -1,3 +1,10 @@
 v0.5.1 (released March 03, 2018):

 - Improved behavior when adding parameters to templates (via Template.add())
  with poorly formatted whitespace conventions. (#185)
 - Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
  quoted attributes. (#190)

 v0.5 (released June 23, 2017):

 - Added Wikicode.contains() to determine whether a Node or Wikicode object is
--- a/+ 1
+++ b/+ 1
@@ -1,4 +1,4 @@
 Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
 Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,6 +1,6 @@
 # This config file is used by appveyor.com to build Windows release binaries

 version: 0.5-b{build}
 version: 0.5.1-b{build}

 branches:
  only:
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,6 +1,19 @@
 Changelog
 =========

 v0.5.1
 ------

 `Released March 03, 2018 <https://github.com/earwig/mwparserfromhell/tree/v0.5.1>`_
 (`changes <https://github.com/earwig/mwparserfromhell/compare/v0.5...v0.5.1>`__):

 - Improved behavior when adding parameters to templates (via
  :meth:`.Template.add`) with poorly formatted whitespace conventions.
  (`#185 <https://github.com/earwig/mwparserfromhell/issues/185>`_)
 - Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
  quoted attributes.
  (`#190 <https://github.com/earwig/mwparserfromhell/issues/190>`_)

 v0.5
 ----

--- a/docs/conf.py
+++ b/docs/conf.py
@@ -42,7 +42,7 @@ master_doc = 'index'

 # General information about the project.
 project = u'mwparserfromhell'
 copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic'
 copyright = u'2012–2018 Ben Kurtovic'

 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
--- a/mwparserfromhell/init.py
+++ b/mwparserfromhell/init.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
 # Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -27,9 +27,9 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode.
 """

 __author__ = "Ben Kurtovic"
 __copyright__ = "Copyright (C) 2012, 2013, 2014, 2015, 2016 Ben Kurtovic"
 __copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic"
 __license__ = "MIT License"
 __version__ = "0.5"
 __version__ = "0.5.1"
 __email__ = "ben.kurtovic@gmail.com"

 from . import (compat, definitions, nodes, parser, smart_list, string_mixin,
--- a/mwparserfromhell/nodes/template.py
+++ b/mwparserfromhell/nodes/template.py
@@ -101,7 +101,7 @@ class Template(Node):
            values = tuple(theories.values())
            best = max(values)
            confidence = float(best) / sum(values)
            if confidence >= 0.75:
            if confidence > 0.5:
                return tuple(theories.keys())[values.index(best)]

    @staticmethod
@@ -130,6 +130,8 @@ class Template(Node):
        before_theories = defaultdict(lambda: 0)
        after_theories = defaultdict(lambda: 0)
        for param in self.params:
            if not param.showkey:
                continue
            if use_names:
                component = str(param.name)
            else:
--- a/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -1,5 +1,5 @@
 /*
 Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
 Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -722,7 +722,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
 */
 static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
 {
    #define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS
    #define NOT_A_LINK                                        \
        if (!brackets && self->topstack->context & LC_DLTERM) \
            return Tokenizer_handle_dl_term(self);            \
@@ -732,7 +731,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
    PyObject *link, *kwargs;
    Textbuffer *extra;

    if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
    if (self->topstack->context & AGG_NO_EXT_LINKS ||
            !(Tokenizer_CAN_RECURSE(self))) {
        NOT_A_LINK;
    }
    extra = Textbuffer_new(&self->text);
@@ -1280,6 +1280,7 @@ static int Tokenizer_handle_tag_data(
    else if (data->context & TAG_NOTE_SPACE) {
        if (data->context & TAG_QUOTED) {
            data->context = TAG_ATTR_VALUE;
            Tokenizer_memoize_bad_route(self);
            trash = Tokenizer_pop(self);
            Py_XDECREF(trash);
            self->head = data->reset - 1;  // Will be auto-incremented
@@ -1317,7 +1318,12 @@ static int Tokenizer_handle_tag_data(
                data->context |= TAG_QUOTED;
                data->quoter = chunk;
                data->reset = self->head;
                if (Tokenizer_push(self, self->topstack->context))
                if (Tokenizer_check_route(self, self->topstack->context) < 0) {
                    RESET_ROUTE();
                    data->context = TAG_ATTR_VALUE;
                    self->head--;
                }
                else if (Tokenizer_push(self, self->topstack->context))
                    return -1;
                return 0;
            }
@@ -1613,6 +1619,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
                if (data->context & TAG_QUOTED) {
                    // Unclosed attribute quote: reset, don't die
                    data->context = TAG_ATTR_VALUE;
                    Tokenizer_memoize_bad_route(self);
                    trash = Tokenizer_pop(self);
                    Py_XDECREF(trash);
                    self->head = data->reset;
@@ -2185,6 +2192,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token
                if (data->context & TAG_QUOTED) {
                    // Unclosed attribute quote: reset, don't die
                    data->context = TAG_ATTR_VALUE;
                    Tokenizer_memoize_bad_route(self);
                    trash = Tokenizer_pop(self);
                    Py_XDECREF(trash);
                    self->head = data->reset;
--- a/mwparserfromhell/parser/ctokenizer/tok_support.c
+++ b/mwparserfromhell/parser/ctokenizer/tok_support.c
@@ -1,5 +1,5 @@
 /*
 Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
 Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -147,6 +147,22 @@ static int compare_nodes(
 }

 /*
    Record the route at the top of the stack (identified by its head position
    and the context it was pushed with) as a known dead end.

    A later Tokenizer_check_route call made with the same head and context
    will find this record and fail that route straight away.

    Recording is best-effort: if the bookkeeping node cannot be allocated,
    nothing is stored and the function returns normally.
 */
 void Tokenizer_memoize_bad_route(Tokenizer *self)
 {
    route_tree_node *entry = malloc(sizeof(route_tree_node));

    if (!entry)
        return;  // Out of memory: skip memoization rather than fail parsing

    entry->id = self->topstack->ident;
    // A non-zero result means the tree did not take ownership of the new node
    // (presumably an equal ident is already stored -- confirm against the AVL
    // tree implementation), so release it here.
    if (avl_tree_insert(&self->bad_routes, &entry->node, compare_nodes))
        free(entry);
 }

 /*
    Fail the current tokenization route. Discards the current
    stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the
    ident of the failed stack so future parsing attempts down this route can be
@@ -157,13 +173,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
    uint64_t context = self->topstack->context;
    PyObject* stack;

    route_tree_node *node = malloc(sizeof(route_tree_node));
    if (node) {
        node->id = self->topstack->ident;
        if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
            free(node);
    }

    Tokenizer_memoize_bad_route(self);
    stack = Tokenizer_pop(self);
    Py_XDECREF(stack);
    FAIL_ROUTE(context);
@@ -173,7 +183,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
 /*
    Check if pushing a new route here with the given context would definitely
    fail, based on a previous call to Tokenizer_fail_route() with the same
    stack.
    stack. (Or any other call to Tokenizer_memoize_bad_route().)

    Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the
    latter case.
--- a/mwparserfromhell/parser/ctokenizer/tok_support.h
+++ b/mwparserfromhell/parser/ctokenizer/tok_support.h
@@ -1,5 +1,5 @@
 /*
 Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
 Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -31,6 +31,7 @@ int Tokenizer_push_textbuffer(Tokenizer*);
 void Tokenizer_delete_top_of_stack(Tokenizer*);
 PyObject* Tokenizer_pop(Tokenizer*);
 PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
 void Tokenizer_memoize_bad_route(Tokenizer*);
 void* Tokenizer_fail_route(Tokenizer*);
 int Tokenizer_check_route(Tokenizer*, uint64_t);
 void Tokenizer_free_bad_route_tree(Tokenizer*);
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
 # Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -144,6 +144,14 @@ class Tokenizer(object):
        """Return whether or not our max recursion depth has been exceeded."""
        return self._depth < self.MAX_DEPTH

    def _memoize_bad_route(self):
        """Record the current route as one that is known to fail.

        The route is identified by the current stack ident (head + context at
        push), which is stored in the set of bad routes. A later ``_push``
        made with the same head and context will notice the entry and fail
        that route immediately.
        """
        failed_ident = self._stack_ident
        self._bad_routes.add(failed_ident)

    def _fail_route(self):
        """Fail the current tokenization route.

@@ -151,7 +159,7 @@ class Tokenizer(object):
        :exc:`.BadRoute`.
        """
        context = self._context
        self._bad_routes.add(self._stack_ident)
        self._memoize_bad_route()
        self._pop()
        raise BadRoute(context)

@@ -506,12 +514,16 @@ class Tokenizer(object):

    def _parse_external_link(self, brackets):
        """Parse an external link at the head of the wikicode string."""
        if self._context & contexts.NO_EXT_LINKS or not self._can_recurse():
            if not brackets and self._context & contexts.DL_TERM:
                self._handle_dl_term()
            else:
                self._emit_text(self._read())
            return

        reset = self._head
        self._head += 1
        try:
            bad_context = self._context & contexts.NO_EXT_LINKS
            if bad_context or not self._can_recurse():
                raise BadRoute()
            link, extra, delta = self._really_parse_external_link(brackets)
        except BadRoute:
            self._head = reset
@@ -719,6 +731,7 @@ class Tokenizer(object):
            elif data.context & data.CX_NOTE_SPACE:
                if data.context & data.CX_QUOTED:
                    data.context = data.CX_ATTR_VALUE
                    self._memoize_bad_route()
                    self._pop()
                    self._head = data.reset - 1  # Will be auto-incremented
                    return  # Break early
@@ -743,7 +756,13 @@ class Tokenizer(object):
                        data.context |= data.CX_QUOTED
                        data.quoter = chunk
                        data.reset = self._head
                        self._push(self._context)
                        try:
                            self._push(self._context)
                        except BadRoute:
                            # Already failed to parse this as a quoted string
                            data.context = data.CX_ATTR_VALUE
                            self._head -= 1
                            return
                        continue
                elif data.context & data.CX_QUOTED:
                    if chunk == data.quoter and not escaped:
@@ -845,6 +864,7 @@ class Tokenizer(object):
                    if data.context & data.CX_QUOTED:
                        # Unclosed attribute quote: reset, don't die
                        data.context = data.CX_ATTR_VALUE
                        self._memoize_bad_route()
                        self._pop()
                        self._head = data.reset
                        continue
@@ -1084,6 +1104,7 @@ class Tokenizer(object):
                    if data.context & data.CX_QUOTED:
                        # Unclosed attribute quote: reset, don't die
                        data.context = data.CX_ATTR_VALUE
                        self._memoize_bad_route()
                        self._pop()
                        self._head = data.reset
                        continue
--- a/scripts/release.sh
+++ b/scripts/release.sh
@@ -1,5 +1,7 @@
 #! /usr/bin/env bash

 set -euo pipefail

 if [[ -z "$1" ]]; then
    echo "usage: $0 1.2.3"
    exit 1
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
 # Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -107,6 +107,7 @@ setup(
        "Programming Language :: Python :: 3.4",
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Topic :: Text Processing :: Markup"
    ],
 )
--- a/tests/test_template.py
+++ b/tests/test_template.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
 # Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -21,6 +21,7 @@
 # SOFTWARE.

 from __future__ import unicode_literals
 from difflib import unified_diff

 try:
    import unittest2 as unittest
@@ -30,6 +31,8 @@ except ImportError:
 from mwparserfromhell.compat import str
 from mwparserfromhell.nodes import HTMLEntity, Template, Text
 from mwparserfromhell.nodes.extras import Parameter
 from mwparserfromhell import parse

 from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext

 pgens = lambda k, v: Parameter(wraptext(k), wraptext(v), showkey=True)
@@ -287,7 +290,7 @@ class TestTemplate(TreeEqualityTestCase):
        self.assertIsInstance(node12.params[1].value.get(1), HTMLEntity)
        self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|\nh = i}}", node13)
        self.assertEqual("{{a\n|b =c\n|d = e|f =g\n|h = i\n|j =k\n}}", node14)
        self.assertEqual("{{a|b  = c\n|\nd  = e|\nf  =g |h  =i}}", node15)
        self.assertEqual("{{a|b  = c\n|\nd  = e|\nf  =g |\nh  = i}}", node15)
        self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|h=i}}", node16)
        self.assertEqual("{{a|b|c}}", node17)
        self.assertEqual("{{a|b|3=c}}", node18)
@@ -439,5 +442,172 @@ class TestTemplate(TreeEqualityTestCase):
        self.assertEqual("{{foo|a=b|c=d|e=f|a=|a=b}}", node26)
        self.assertRaises(ValueError, node27.remove, node28.get(1))

    def test_formatting(self):
        """Test realistic param manipulation with complex whitespace formatting.

        Each case parses a real-world infobox, applies the same three edits to
        its first template, and compares a unified diff of the text before and
        after against the expected diff. This assumes that parsing itself
        works correctly.
        """
        # Each entry is an (original wikitext, expected diff) pair. The
        # expected diff is unified_diff output with one line of context, minus
        # the two leading file-header lines and surrounding whitespace.
        tests = [
    # https://en.wikipedia.org/w/index.php?title=Lamar_County,_Georgia&oldid=792356004
    ("""{{Infobox U.S. county
 | county = Lamar County
 | state = Georgia
 | seal =
 | founded = 1920
 | seat wl = Barnesville
 | largest city wl = Barnesville
 | area_total_sq_mi = 186
 | area_land_sq_mi = 184
 | area_water_sq_mi = 2.3
 | area percentage = 1.3%
 | census yr = 2010
 | pop = 18317
 | density_sq_mi = 100
 | time zone = Eastern
 | footnotes =
 | web = www.lamarcountyga.com
 | ex image = Lamar County Georgia Courthouse.jpg
 | ex image cap = Lamar County courthouse in Barnesville
 | district = 3rd
 | named for = [[Lucius Quintus Cincinnatus Lamar II]]
 }}""",
    """@@ -11,4 +11,4 @@
 | area percentage = 1.3%
 -| census yr = 2010
 -| pop = 18317
 +| census estimate yr = 2016
 +| pop = 12345<ref>example ref</ref>
 | density_sq_mi = 100"""),

    # https://en.wikipedia.org/w/index.php?title=Rockdale_County,_Georgia&oldid=792359760
    ("""{{Infobox U.S. County|
 county = Rockdale County |
 state = Georgia |
 seal =  |
 founded = October 18, 1870 |
 seat wl = Conyers |
 largest city wl = Conyers |
 area_total_sq_mi = 132 |
 area_land_sq_mi = 130 |
 area_water_sq_mi = 2.3 |
 area percentage = 1.7% |
 census yr = 2010|
 pop = 85215 |
 density_sq_mi = 657 |
 web = www.rockdalecounty.org
 | ex image = Rockdale-county-courthouse.jpg
 | ex image cap = Rockdale County Courthouse in Conyers
 | district = 4th
 | time zone= Eastern
 }}""",
    """@@ -11,4 +11,4 @@
  area percentage = 1.7% |
 - census yr = 2010|
 - pop = 85215 |
 + census estimate yr = 2016 |
 + pop = 12345<ref>example ref</ref> |
  density_sq_mi = 657 |"""),

    # https://en.wikipedia.org/w/index.php?title=Spalding_County,_Georgia&oldid=792360413
    ("""{{Infobox U.S. County|
 | county = Spalding County |
 | state = Georgia |
 | seal =  |
 | founded = 1851 |
 | seat wl = Griffin |
 | largest city wl = Griffin |
 | area_total_sq_mi = 200 |
 | area_land_sq_mi = 196 |
 | area_water_sq_mi = 3.1 |
 | area percentage = 1.6% |
 | census yr = 2010|
 | pop = 64073 |
 | density_sq_mi = 326 |
 | web = www.spaldingcounty.com |
 | named for = [[Thomas Spalding]]
 | ex image = Spalding County Courthouse (NE corner).JPG
 | ex image cap = Spalding County Courthouse in Griffin
 | district = 3rd
 | time zone = Eastern
 }}""",
    """@@ -11,4 +11,4 @@
 | area percentage = 1.6% |
 -| census yr = 2010|
 -| pop = 64073 |
 +|
 +| census estimate yr = 2016 | pop = 12345<ref>example ref</ref> |
 | density_sq_mi = 326 |"""),

    # https://en.wikipedia.org/w/index.php?title=Clinton_County,_Illinois&oldid=794694648
    ("""{{Infobox U.S. county
 |county  = Clinton County
 |state = Illinois
 | ex image           = File:Clinton County Courthouse, Carlyle.jpg
 | ex image cap       = [[Clinton County Courthouse (Illinois)|Clinton County Courthouse]]
 |seal =
 |founded = 1824
 |named for = [[DeWitt Clinton]]
 |seat wl= Carlyle
 | largest city wl = Breese
 |time zone=Central
 |area_total_sq_mi = 503
 |area_land_sq_mi = 474
 |area_water_sq_mi = 29
 |area percentage = 5.8%
 |census yr = 2010
 |pop = 37762
 |density_sq_mi = 80
 |web = www.clintonco.illinois.gov
 | district = 15th
 }}""",
    """@@ -15,4 +15,4 @@
  |area percentage = 5.8%
 - |census yr = 2010
 - |pop = 37762
 + |census estimate yr = 2016
 + |pop = 12345<ref>example ref</ref>
  |density_sq_mi = 80"""),

    # https://en.wikipedia.org/w/index.php?title=Winnebago_County,_Illinois&oldid=789193800
    ("""{{Infobox U.S. county |
 county  = Winnebago County |
 state = Illinois |
 seal = Winnebago County il seal.png |
 named for = [[Winnebago (tribe)|Winnebago Tribe]] |
 seat wl= Rockford |
 largest city wl = Rockford|
 area_total_sq_mi = 519 |
 area_land_sq_mi = 513|
 area_water_sq_mi = 5.9 |
 area percentage = 1.1% |
 census yr = 2010|
 pop = 295266 |
 density_sq_mi = 575
 | web = www.wincoil.us
 | founded year = 1836
 | founded date = January 16
 | time zone = Central
 | district = 16th
 | district2 = 17th
 }}""",
    """@@ -11,4 +11,4 @@
  area percentage = 1.1% |
 - census yr = 2010|
 - pop = 295266 |
 + census estimate yr = 2016|
 + pop = 12345<ref>example ref</ref> |
  density_sq_mi = 575""")]

        for (original, expected) in tests:
            code = parse(original)
            # Only the first template in the parsed text is edited.
            template = code.filter_templates()[0]
            # The same three edits for every case: give "pop" a new value,
            # add "census estimate yr" before it, and remove "census yr".
            template.add("pop", "12345<ref>example ref</ref>")
            template.add('census estimate yr', "2016", before="pop")
            template.remove("census yr")

            # splitlines(True) keeps line endings, so the diff lines can be
            # re-joined verbatim below.
            oldlines = original.splitlines(True)
            newlines = str(code).splitlines(True)
            # n=1 requests a single line of context around each change.
            difflines = unified_diff(oldlines, newlines, n=1)
            # Skip the first two diff lines (the "---"/"+++" file header) and
            # trim surrounding whitespace before comparing.
            diff = "".join(list(difflines)[2:]).strip()
            self.assertEqual(expected, diff)

 # Run this module's tests with verbose output when executed as a script.
 if __name__ == "__main__":
    unittest.main(verbosity=2)