@@ -1,3 +1,10 @@ | |||
v0.5.1 (released March 03, 2018): | |||
- Improved behavior when adding parameters to templates (via Template.add()) | |||
with poorly formatted whitespace conventions. (#185) | |||
- Fixed the parser getting stuck in deeply nested HTML tags with unclosed, | |||
quoted attributes. (#190) | |||
v0.5 (released June 23, 2017): | |||
- Added Wikicode.contains() to determine whether a Node or Wikicode object is | |||
@@ -1,4 +1,4 @@ | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# This config file is used by appveyor.com to build Windows release binaries | |||
version: 0.5-b{build} | |||
version: 0.5.1-b{build} | |||
branches: | |||
only: | |||
@@ -1,6 +1,19 @@ | |||
Changelog | |||
========= | |||
v0.5.1 | |||
------ | |||
`Released March 03, 2018 <https://github.com/earwig/mwparserfromhell/tree/v0.5.1>`_ | |||
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.5...v0.5.1>`__): | |||
- Improved behavior when adding parameters to templates (via | |||
:meth:`.Template.add`) with poorly formatted whitespace conventions. | |||
(`#185 <https://github.com/earwig/mwparserfromhell/issues/185>`_) | |||
- Fixed the parser getting stuck in deeply nested HTML tags with unclosed, | |||
quoted attributes. | |||
(`#190 <https://github.com/earwig/mwparserfromhell/issues/190>`_) | |||
v0.5 | |||
---- | |||
@@ -42,7 +42,7 @@ master_doc = 'index' | |||
# General information about the project. | |||
project = u'mwparserfromhell' | |||
copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic' | |||
copyright = u'2012–2018 Ben Kurtovic' | |||
# The version info for the project you're documenting, acts as replacement for | |||
# |version| and |release|, also used in various other places throughout the | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -27,9 +27,9 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode. | |||
""" | |||
__author__ = "Ben Kurtovic" | |||
__copyright__ = "Copyright (C) 2012, 2013, 2014, 2015, 2016 Ben Kurtovic" | |||
__copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic" | |||
__license__ = "MIT License" | |||
__version__ = "0.5" | |||
__version__ = "0.5.1" | |||
__email__ = "ben.kurtovic@gmail.com" | |||
from . import (compat, definitions, nodes, parser, smart_list, string_mixin, | |||
@@ -101,7 +101,7 @@ class Template(Node): | |||
values = tuple(theories.values()) | |||
best = max(values) | |||
confidence = float(best) / sum(values) | |||
if confidence >= 0.75: | |||
if confidence > 0.5: | |||
return tuple(theories.keys())[values.index(best)] | |||
@staticmethod | |||
@@ -130,6 +130,8 @@ class Template(Node): | |||
before_theories = defaultdict(lambda: 0) | |||
after_theories = defaultdict(lambda: 0) | |||
for param in self.params: | |||
if not param.showkey: | |||
continue | |||
if use_names: | |||
component = str(param.name) | |||
else: | |||
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -722,7 +722,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) | |||
*/ | |||
static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) | |||
{ | |||
#define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS | |||
#define NOT_A_LINK \ | |||
if (!brackets && self->topstack->context & LC_DLTERM) \ | |||
return Tokenizer_handle_dl_term(self); \ | |||
@@ -732,7 +731,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) | |||
PyObject *link, *kwargs; | |||
Textbuffer *extra; | |||
if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { | |||
if (self->topstack->context & AGG_NO_EXT_LINKS || | |||
!(Tokenizer_CAN_RECURSE(self))) { | |||
NOT_A_LINK; | |||
} | |||
extra = Textbuffer_new(&self->text); | |||
@@ -1280,6 +1280,7 @@ static int Tokenizer_handle_tag_data( | |||
else if (data->context & TAG_NOTE_SPACE) { | |||
if (data->context & TAG_QUOTED) { | |||
data->context = TAG_ATTR_VALUE; | |||
Tokenizer_memoize_bad_route(self); | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
self->head = data->reset - 1; // Will be auto-incremented | |||
@@ -1317,7 +1318,12 @@ static int Tokenizer_handle_tag_data( | |||
data->context |= TAG_QUOTED; | |||
data->quoter = chunk; | |||
data->reset = self->head; | |||
if (Tokenizer_push(self, self->topstack->context)) | |||
if (Tokenizer_check_route(self, self->topstack->context) < 0) { | |||
RESET_ROUTE(); | |||
data->context = TAG_ATTR_VALUE; | |||
self->head--; | |||
} | |||
else if (Tokenizer_push(self, self->topstack->context)) | |||
return -1; | |||
return 0; | |||
} | |||
@@ -1613,6 +1619,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) | |||
if (data->context & TAG_QUOTED) { | |||
// Unclosed attribute quote: reset, don't die | |||
data->context = TAG_ATTR_VALUE; | |||
Tokenizer_memoize_bad_route(self); | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
self->head = data->reset; | |||
@@ -2185,6 +2192,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token | |||
if (data->context & TAG_QUOTED) { | |||
// Unclosed attribute quote: reset, don't die | |||
data->context = TAG_ATTR_VALUE; | |||
Tokenizer_memoize_bad_route(self); | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
self->head = data->reset; | |||
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -147,6 +147,22 @@ static int compare_nodes( | |||
} | |||
/* | |||
Remember that the current route (head + context at push) is invalid. | |||
This will be noticed when calling Tokenizer_check_route with the same head | |||
and context, and the route will be failed immediately. | |||
*/ | |||
void Tokenizer_memoize_bad_route(Tokenizer *self) | |||
{ | |||
route_tree_node *node = malloc(sizeof(route_tree_node)); | |||
if (node) { | |||
node->id = self->topstack->ident; | |||
if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) | |||
free(node); | |||
} | |||
} | |||
/* | |||
Fail the current tokenization route. Discards the current | |||
stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the | |||
ident of the failed stack so future parsing attempts down this route can be | |||
@@ -157,13 +173,7 @@ void* Tokenizer_fail_route(Tokenizer* self) | |||
uint64_t context = self->topstack->context; | |||
PyObject* stack; | |||
route_tree_node *node = malloc(sizeof(route_tree_node)); | |||
if (node) { | |||
node->id = self->topstack->ident; | |||
if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) | |||
free(node); | |||
} | |||
Tokenizer_memoize_bad_route(self); | |||
stack = Tokenizer_pop(self); | |||
Py_XDECREF(stack); | |||
FAIL_ROUTE(context); | |||
@@ -173,7 +183,7 @@ void* Tokenizer_fail_route(Tokenizer* self) | |||
/* | |||
Check if pushing a new route here with the given context would definitely | |||
fail, based on a previous call to Tokenizer_fail_route() with the same | |||
stack. | |||
stack. (Or any other call to Tokenizer_memoize_bad_route().) | |||
Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the | |||
latter case. | |||
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -31,6 +31,7 @@ int Tokenizer_push_textbuffer(Tokenizer*); | |||
void Tokenizer_delete_top_of_stack(Tokenizer*); | |||
PyObject* Tokenizer_pop(Tokenizer*); | |||
PyObject* Tokenizer_pop_keeping_context(Tokenizer*); | |||
void Tokenizer_memoize_bad_route(Tokenizer*); | |||
void* Tokenizer_fail_route(Tokenizer*); | |||
int Tokenizer_check_route(Tokenizer*, uint64_t); | |||
void Tokenizer_free_bad_route_tree(Tokenizer*); | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -144,6 +144,14 @@ class Tokenizer(object): | |||
"""Return whether or not our max recursion depth has been exceeded.""" | |||
return self._depth < self.MAX_DEPTH | |||
def _memoize_bad_route(self):
    """Mark the current route (head + context at push) as invalid.

    The current stack ident is added to the set of known bad routes. A
    later ``_push`` with the same head and context will notice the entry
    and fail that route immediately.
    """
    ident = self._stack_ident
    self._bad_routes.add(ident)
def _fail_route(self): | |||
"""Fail the current tokenization route. | |||
@@ -151,7 +159,7 @@ class Tokenizer(object): | |||
:exc:`.BadRoute`. | |||
""" | |||
context = self._context | |||
self._bad_routes.add(self._stack_ident) | |||
self._memoize_bad_route() | |||
self._pop() | |||
raise BadRoute(context) | |||
@@ -506,12 +514,16 @@ class Tokenizer(object): | |||
def _parse_external_link(self, brackets): | |||
"""Parse an external link at the head of the wikicode string.""" | |||
if self._context & contexts.NO_EXT_LINKS or not self._can_recurse(): | |||
if not brackets and self._context & contexts.DL_TERM: | |||
self._handle_dl_term() | |||
else: | |||
self._emit_text(self._read()) | |||
return | |||
reset = self._head | |||
self._head += 1 | |||
try: | |||
bad_context = self._context & contexts.NO_EXT_LINKS | |||
if bad_context or not self._can_recurse(): | |||
raise BadRoute() | |||
link, extra, delta = self._really_parse_external_link(brackets) | |||
except BadRoute: | |||
self._head = reset | |||
@@ -719,6 +731,7 @@ class Tokenizer(object): | |||
elif data.context & data.CX_NOTE_SPACE: | |||
if data.context & data.CX_QUOTED: | |||
data.context = data.CX_ATTR_VALUE | |||
self._memoize_bad_route() | |||
self._pop() | |||
self._head = data.reset - 1 # Will be auto-incremented | |||
return # Break early | |||
@@ -743,7 +756,13 @@ class Tokenizer(object): | |||
data.context |= data.CX_QUOTED | |||
data.quoter = chunk | |||
data.reset = self._head | |||
self._push(self._context) | |||
try: | |||
self._push(self._context) | |||
except BadRoute: | |||
# Already failed to parse this as a quoted string | |||
data.context = data.CX_ATTR_VALUE | |||
self._head -= 1 | |||
return | |||
continue | |||
elif data.context & data.CX_QUOTED: | |||
if chunk == data.quoter and not escaped: | |||
@@ -845,6 +864,7 @@ class Tokenizer(object): | |||
if data.context & data.CX_QUOTED: | |||
# Unclosed attribute quote: reset, don't die | |||
data.context = data.CX_ATTR_VALUE | |||
self._memoize_bad_route() | |||
self._pop() | |||
self._head = data.reset | |||
continue | |||
@@ -1084,6 +1104,7 @@ class Tokenizer(object): | |||
if data.context & data.CX_QUOTED: | |||
# Unclosed attribute quote: reset, don't die | |||
data.context = data.CX_ATTR_VALUE | |||
self._memoize_bad_route() | |||
self._pop() | |||
self._head = data.reset | |||
continue | |||
@@ -1,5 +1,7 @@ | |||
#! /usr/bin/env bash | |||
set -euo pipefail | |||
if [[ -z "$1" ]]; then | |||
echo "usage: $0 1.2.3" | |||
exit 1 | |||
@@ -1,7 +1,7 @@ | |||
#! /usr/bin/env python | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -107,6 +107,7 @@ setup( | |||
"Programming Language :: Python :: 3.4", | |||
"Programming Language :: Python :: 3.5", | |||
"Programming Language :: Python :: 3.6", | |||
"Programming Language :: Python :: 3.7", | |||
"Topic :: Text Processing :: Markup" | |||
], | |||
) |
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -21,6 +21,7 @@ | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from difflib import unified_diff | |||
try: | |||
import unittest2 as unittest | |||
@@ -30,6 +31,8 @@ except ImportError: | |||
from mwparserfromhell.compat import str | |||
from mwparserfromhell.nodes import HTMLEntity, Template, Text | |||
from mwparserfromhell.nodes.extras import Parameter | |||
from mwparserfromhell import parse | |||
from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext | |||
pgens = lambda k, v: Parameter(wraptext(k), wraptext(v), showkey=True) | |||
@@ -287,7 +290,7 @@ class TestTemplate(TreeEqualityTestCase): | |||
self.assertIsInstance(node12.params[1].value.get(1), HTMLEntity) | |||
self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|\nh = i}}", node13) | |||
self.assertEqual("{{a\n|b =c\n|d = e|f =g\n|h = i\n|j =k\n}}", node14) | |||
self.assertEqual("{{a|b = c\n|\nd = e|\nf =g |h =i}}", node15) | |||
self.assertEqual("{{a|b = c\n|\nd = e|\nf =g |\nh = i}}", node15) | |||
self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|h=i}}", node16) | |||
self.assertEqual("{{a|b|c}}", node17) | |||
self.assertEqual("{{a|b|3=c}}", node18) | |||
@@ -439,5 +442,172 @@ class TestTemplate(TreeEqualityTestCase): | |||
self.assertEqual("{{foo|a=b|c=d|e=f|a=|a=b}}", node26) | |||
self.assertRaises(ValueError, node27.remove, node28.get(1)) | |||
def test_formatting(self): | |||
"""test realistic param manipulation with complex whitespace formatting | |||
(assumes that parsing works correctly)""" | |||
tests = [ | |||
# https://en.wikipedia.org/w/index.php?title=Lamar_County,_Georgia&oldid=792356004 | |||
("""{{Infobox U.S. county | |||
| county = Lamar County | |||
| state = Georgia | |||
| seal = | |||
| founded = 1920 | |||
| seat wl = Barnesville | |||
| largest city wl = Barnesville | |||
| area_total_sq_mi = 186 | |||
| area_land_sq_mi = 184 | |||
| area_water_sq_mi = 2.3 | |||
| area percentage = 1.3% | |||
| census yr = 2010 | |||
| pop = 18317 | |||
| density_sq_mi = 100 | |||
| time zone = Eastern | |||
| footnotes = | |||
| web = www.lamarcountyga.com | |||
| ex image = Lamar County Georgia Courthouse.jpg | |||
| ex image cap = Lamar County courthouse in Barnesville | |||
| district = 3rd | |||
| named for = [[Lucius Quintus Cincinnatus Lamar II]] | |||
}}""", | |||
"""@@ -11,4 +11,4 @@ | |||
| area percentage = 1.3% | |||
-| census yr = 2010 | |||
-| pop = 18317 | |||
+| census estimate yr = 2016 | |||
+| pop = 12345<ref>example ref</ref> | |||
| density_sq_mi = 100"""), | |||
# https://en.wikipedia.org/w/index.php?title=Rockdale_County,_Georgia&oldid=792359760 | |||
("""{{Infobox U.S. County| | |||
county = Rockdale County | | |||
state = Georgia | | |||
seal = | | |||
founded = October 18, 1870 | | |||
seat wl = Conyers | | |||
largest city wl = Conyers | | |||
area_total_sq_mi = 132 | | |||
area_land_sq_mi = 130 | | |||
area_water_sq_mi = 2.3 | | |||
area percentage = 1.7% | | |||
census yr = 2010| | |||
pop = 85215 | | |||
density_sq_mi = 657 | | |||
web = www.rockdalecounty.org | |||
| ex image = Rockdale-county-courthouse.jpg | |||
| ex image cap = Rockdale County Courthouse in Conyers | |||
| district = 4th | |||
| time zone= Eastern | |||
}}""", | |||
"""@@ -11,4 +11,4 @@ | |||
area percentage = 1.7% | | |||
- census yr = 2010| | |||
- pop = 85215 | | |||
+ census estimate yr = 2016 | | |||
+ pop = 12345<ref>example ref</ref> | | |||
density_sq_mi = 657 |"""), | |||
# https://en.wikipedia.org/w/index.php?title=Spalding_County,_Georgia&oldid=792360413 | |||
("""{{Infobox U.S. County| | |||
| county = Spalding County | | |||
| state = Georgia | | |||
| seal = | | |||
| founded = 1851 | | |||
| seat wl = Griffin | | |||
| largest city wl = Griffin | | |||
| area_total_sq_mi = 200 | | |||
| area_land_sq_mi = 196 | | |||
| area_water_sq_mi = 3.1 | | |||
| area percentage = 1.6% | | |||
| census yr = 2010| | |||
| pop = 64073 | | |||
| density_sq_mi = 326 | | |||
| web = www.spaldingcounty.com | | |||
| named for = [[Thomas Spalding]] | |||
| ex image = Spalding County Courthouse (NE corner).JPG | |||
| ex image cap = Spalding County Courthouse in Griffin | |||
| district = 3rd | |||
| time zone = Eastern | |||
}}""", | |||
"""@@ -11,4 +11,4 @@ | |||
| area percentage = 1.6% | | |||
-| census yr = 2010| | |||
-| pop = 64073 | | |||
+| | |||
+| census estimate yr = 2016 | pop = 12345<ref>example ref</ref> | | |||
| density_sq_mi = 326 |"""), | |||
# https://en.wikipedia.org/w/index.php?title=Clinton_County,_Illinois&oldid=794694648 | |||
("""{{Infobox U.S. county | |||
|county = Clinton County | |||
|state = Illinois | |||
| ex image = File:Clinton County Courthouse, Carlyle.jpg | |||
| ex image cap = [[Clinton County Courthouse (Illinois)|Clinton County Courthouse]] | |||
|seal = | |||
|founded = 1824 | |||
|named for = [[DeWitt Clinton]] | |||
|seat wl= Carlyle | |||
| largest city wl = Breese | |||
|time zone=Central | |||
|area_total_sq_mi = 503 | |||
|area_land_sq_mi = 474 | |||
|area_water_sq_mi = 29 | |||
|area percentage = 5.8% | |||
|census yr = 2010 | |||
|pop = 37762 | |||
|density_sq_mi = 80 | |||
|web = www.clintonco.illinois.gov | |||
| district = 15th | |||
}}""", | |||
"""@@ -15,4 +15,4 @@ | |||
|area percentage = 5.8% | |||
- |census yr = 2010 | |||
- |pop = 37762 | |||
+ |census estimate yr = 2016 | |||
+ |pop = 12345<ref>example ref</ref> | |||
|density_sq_mi = 80"""), | |||
# https://en.wikipedia.org/w/index.php?title=Winnebago_County,_Illinois&oldid=789193800 | |||
("""{{Infobox U.S. county | | |||
county = Winnebago County | | |||
state = Illinois | | |||
seal = Winnebago County il seal.png | | |||
named for = [[Winnebago (tribe)|Winnebago Tribe]] | | |||
seat wl= Rockford | | |||
largest city wl = Rockford| | |||
area_total_sq_mi = 519 | | |||
area_land_sq_mi = 513| | |||
area_water_sq_mi = 5.9 | | |||
area percentage = 1.1% | | |||
census yr = 2010| | |||
pop = 295266 | | |||
density_sq_mi = 575 | |||
| web = www.wincoil.us | |||
| founded year = 1836 | |||
| founded date = January 16 | |||
| time zone = Central | |||
| district = 16th | |||
| district2 = 17th | |||
}}""", | |||
"""@@ -11,4 +11,4 @@ | |||
area percentage = 1.1% | | |||
- census yr = 2010| | |||
- pop = 295266 | | |||
+ census estimate yr = 2016| | |||
+ pop = 12345<ref>example ref</ref> | | |||
density_sq_mi = 575""")] | |||
for (original, expected) in tests: | |||
code = parse(original) | |||
template = code.filter_templates()[0] | |||
template.add("pop", "12345<ref>example ref</ref>") | |||
template.add('census estimate yr', "2016", before="pop") | |||
template.remove("census yr") | |||
oldlines = original.splitlines(True) | |||
newlines = str(code).splitlines(True) | |||
difflines = unified_diff(oldlines, newlines, n=1) | |||
diff = "".join(list(difflines)[2:]).strip() | |||
self.assertEqual(expected, diff) | |||
if __name__ == "__main__": | |||
unittest.main(verbosity=2) |