Browse Source

Merge develop into master (release/0.5.1)

undefined
Ben Kurtovic 6 years ago
parent
commit
c8df09469e
14 changed files with 265 additions and 30 deletions
  1. +7
    -0
      CHANGELOG
  2. +1
    -1
      LICENSE
  3. +1
    -1
      appveyor.yml
  4. +13
    -0
      docs/changelog.rst
  5. +1
    -1
      docs/conf.py
  6. +3
    -3
      mwparserfromhell/__init__.py
  7. +3
    -1
      mwparserfromhell/nodes/template.py
  8. +12
    -4
      mwparserfromhell/parser/ctokenizer/tok_parse.c
  9. +19
    -9
      mwparserfromhell/parser/ctokenizer/tok_support.c
  10. +2
    -1
      mwparserfromhell/parser/ctokenizer/tok_support.h
  11. +27
    -6
      mwparserfromhell/parser/tokenizer.py
  12. +2
    -0
      scripts/release.sh
  13. +2
    -1
      setup.py
  14. +172
    -2
      tests/test_template.py

+ 7
- 0
CHANGELOG View File

@@ -1,3 +1,10 @@
v0.5.1 (released March 03, 2018):

- Improved behavior when adding parameters to templates (via Template.add())
with poorly formatted whitespace conventions. (#185)
- Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
quoted attributes. (#190)

v0.5 (released June 23, 2017):

- Added Wikicode.contains() to determine whether a Node or Wikicode object is


+ 1
- 1
LICENSE View File

@@ -1,4 +1,4 @@
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal


+ 1
- 1
appveyor.yml View File

@@ -1,6 +1,6 @@
# This config file is used by appveyor.com to build Windows release binaries

version: 0.5-b{build}
version: 0.5.1-b{build}

branches:
only:


+ 13
- 0
docs/changelog.rst View File

@@ -1,6 +1,19 @@
Changelog
=========

v0.5.1
------

`Released March 03, 2018 <https://github.com/earwig/mwparserfromhell/tree/v0.5.1>`_
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.5...v0.5.1>`__):

- Improved behavior when adding parameters to templates (via
:meth:`.Template.add`) with poorly formatted whitespace conventions.
(`#185 <https://github.com/earwig/mwparserfromhell/issues/185>`_)
- Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
quoted attributes.
(`#190 <https://github.com/earwig/mwparserfromhell/issues/190>`_)

v0.5
----



+ 1
- 1
docs/conf.py View File

@@ -42,7 +42,7 @@ master_doc = 'index'

# General information about the project.
project = u'mwparserfromhell'
copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic'
copyright = u'2012–2018 Ben Kurtovic'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the


+ 3
- 3
mwparserfromhell/__init__.py View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -27,9 +27,9 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode.
"""

__author__ = "Ben Kurtovic"
__copyright__ = "Copyright (C) 2012, 2013, 2014, 2015, 2016 Ben Kurtovic"
__copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic"
__license__ = "MIT License"
__version__ = "0.5"
__version__ = "0.5.1"
__email__ = "ben.kurtovic@gmail.com"

from . import (compat, definitions, nodes, parser, smart_list, string_mixin,


+ 3
- 1
mwparserfromhell/nodes/template.py View File

@@ -101,7 +101,7 @@ class Template(Node):
values = tuple(theories.values())
best = max(values)
confidence = float(best) / sum(values)
if confidence >= 0.75:
if confidence > 0.5:
return tuple(theories.keys())[values.index(best)]

@staticmethod
@@ -130,6 +130,8 @@ class Template(Node):
before_theories = defaultdict(lambda: 0)
after_theories = defaultdict(lambda: 0)
for param in self.params:
if not param.showkey:
continue
if use_names:
component = str(param.name)
else:


+ 12
- 4
mwparserfromhell/parser/ctokenizer/tok_parse.c View File

@@ -1,5 +1,5 @@
/*
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
@@ -722,7 +722,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
*/
static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
{
#define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS
#define NOT_A_LINK \
if (!brackets && self->topstack->context & LC_DLTERM) \
return Tokenizer_handle_dl_term(self); \
@@ -732,7 +731,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
PyObject *link, *kwargs;
Textbuffer *extra;

if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
if (self->topstack->context & AGG_NO_EXT_LINKS ||
!(Tokenizer_CAN_RECURSE(self))) {
NOT_A_LINK;
}
extra = Textbuffer_new(&self->text);
@@ -1280,6 +1280,7 @@ static int Tokenizer_handle_tag_data(
else if (data->context & TAG_NOTE_SPACE) {
if (data->context & TAG_QUOTED) {
data->context = TAG_ATTR_VALUE;
Tokenizer_memoize_bad_route(self);
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
self->head = data->reset - 1; // Will be auto-incremented
@@ -1317,7 +1318,12 @@ static int Tokenizer_handle_tag_data(
data->context |= TAG_QUOTED;
data->quoter = chunk;
data->reset = self->head;
if (Tokenizer_push(self, self->topstack->context))
if (Tokenizer_check_route(self, self->topstack->context) < 0) {
RESET_ROUTE();
data->context = TAG_ATTR_VALUE;
self->head--;
}
else if (Tokenizer_push(self, self->topstack->context))
return -1;
return 0;
}
@@ -1613,6 +1619,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
if (data->context & TAG_QUOTED) {
// Unclosed attribute quote: reset, don't die
data->context = TAG_ATTR_VALUE;
Tokenizer_memoize_bad_route(self);
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
self->head = data->reset;
@@ -2185,6 +2192,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token
if (data->context & TAG_QUOTED) {
// Unclosed attribute quote: reset, don't die
data->context = TAG_ATTR_VALUE;
Tokenizer_memoize_bad_route(self);
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
self->head = data->reset;


+ 19
- 9
mwparserfromhell/parser/ctokenizer/tok_support.c View File

@@ -1,5 +1,5 @@
/*
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
@@ -147,6 +147,22 @@ static int compare_nodes(
}

/*
    Record the route currently on top of the stack as a known dead end.

    The ident of the top stack frame is stored in the bad_routes tree, so that
    a later Tokenizer_check_route() call made with the same head and context
    notices it and fails the route immediately.

    If the tree node cannot be allocated, nothing is recorded and the call
    returns quietly.
*/
void Tokenizer_memoize_bad_route(Tokenizer *self)
{
    route_tree_node *entry = malloc(sizeof(route_tree_node));

    if (!entry)
        return;
    entry->id = self->topstack->ident;
    // A non-NULL result is treated as "not inserted" (presumably an equal
    // ident is already stored -- confirm against avl_tree_insert()), so the
    // freshly allocated entry is released here.
    if (avl_tree_insert(&self->bad_routes, &entry->node, compare_nodes))
        free(entry);
}

/*
Fail the current tokenization route. Discards the current
stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the
ident of the failed stack so future parsing attempts down this route can be
@@ -157,13 +173,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
uint64_t context = self->topstack->context;
PyObject* stack;

route_tree_node *node = malloc(sizeof(route_tree_node));
if (node) {
node->id = self->topstack->ident;
if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
free(node);
}

Tokenizer_memoize_bad_route(self);
stack = Tokenizer_pop(self);
Py_XDECREF(stack);
FAIL_ROUTE(context);
@@ -173,7 +183,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
/*
Check if pushing a new route here with the given context would definitely
fail, based on a previous call to Tokenizer_fail_route() with the same
stack.
stack. (Or any other call to Tokenizer_memoize_bad_route().)

Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the
latter case.


+ 2
- 1
mwparserfromhell/parser/ctokenizer/tok_support.h View File

@@ -1,5 +1,5 @@
/*
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
@@ -31,6 +31,7 @@ int Tokenizer_push_textbuffer(Tokenizer*);
void Tokenizer_delete_top_of_stack(Tokenizer*);
PyObject* Tokenizer_pop(Tokenizer*);
PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
void Tokenizer_memoize_bad_route(Tokenizer*);
void* Tokenizer_fail_route(Tokenizer*);
int Tokenizer_check_route(Tokenizer*, uint64_t);
void Tokenizer_free_bad_route_tree(Tokenizer*);


+ 27
- 6
mwparserfromhell/parser/tokenizer.py View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -144,6 +144,14 @@ class Tokenizer(object):
"""Return whether or not our max recursion depth has been exceeded."""
return self._depth < self.MAX_DEPTH

def _memoize_bad_route(self):
"""Remember that the current route (head + context at push) is invalid.

This will be noticed when calling _push with the same head and context,
and the route will be failed immediately.
"""
self._bad_routes.add(self._stack_ident)

def _fail_route(self):
"""Fail the current tokenization route.

@@ -151,7 +159,7 @@ class Tokenizer(object):
:exc:`.BadRoute`.
"""
context = self._context
self._bad_routes.add(self._stack_ident)
self._memoize_bad_route()
self._pop()
raise BadRoute(context)

@@ -506,12 +514,16 @@ class Tokenizer(object):

def _parse_external_link(self, brackets):
"""Parse an external link at the head of the wikicode string."""
if self._context & contexts.NO_EXT_LINKS or not self._can_recurse():
if not brackets and self._context & contexts.DL_TERM:
self._handle_dl_term()
else:
self._emit_text(self._read())
return

reset = self._head
self._head += 1
try:
bad_context = self._context & contexts.NO_EXT_LINKS
if bad_context or not self._can_recurse():
raise BadRoute()
link, extra, delta = self._really_parse_external_link(brackets)
except BadRoute:
self._head = reset
@@ -719,6 +731,7 @@ class Tokenizer(object):
elif data.context & data.CX_NOTE_SPACE:
if data.context & data.CX_QUOTED:
data.context = data.CX_ATTR_VALUE
self._memoize_bad_route()
self._pop()
self._head = data.reset - 1 # Will be auto-incremented
return # Break early
@@ -743,7 +756,13 @@ class Tokenizer(object):
data.context |= data.CX_QUOTED
data.quoter = chunk
data.reset = self._head
self._push(self._context)
try:
self._push(self._context)
except BadRoute:
# Already failed to parse this as a quoted string
data.context = data.CX_ATTR_VALUE
self._head -= 1
return
continue
elif data.context & data.CX_QUOTED:
if chunk == data.quoter and not escaped:
@@ -845,6 +864,7 @@ class Tokenizer(object):
if data.context & data.CX_QUOTED:
# Unclosed attribute quote: reset, don't die
data.context = data.CX_ATTR_VALUE
self._memoize_bad_route()
self._pop()
self._head = data.reset
continue
@@ -1084,6 +1104,7 @@ class Tokenizer(object):
if data.context & data.CX_QUOTED:
# Unclosed attribute quote: reset, don't die
data.context = data.CX_ATTR_VALUE
self._memoize_bad_route()
self._pop()
self._head = data.reset
continue


+ 2
- 0
scripts/release.sh View File

@@ -1,5 +1,7 @@
#! /usr/bin/env bash

set -euo pipefail

if [[ -z "$1" ]]; then
echo "usage: $0 1.2.3"
exit 1


+ 2
- 1
setup.py View File

@@ -1,7 +1,7 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -107,6 +107,7 @@ setup(
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Topic :: Text Processing :: Markup"
],
)

+ 172
- 2
tests/test_template.py View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -21,6 +21,7 @@
# SOFTWARE.

from __future__ import unicode_literals
from difflib import unified_diff

try:
import unittest2 as unittest
@@ -30,6 +31,8 @@ except ImportError:
from mwparserfromhell.compat import str
from mwparserfromhell.nodes import HTMLEntity, Template, Text
from mwparserfromhell.nodes.extras import Parameter
from mwparserfromhell import parse

from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext

pgens = lambda k, v: Parameter(wraptext(k), wraptext(v), showkey=True)
@@ -287,7 +290,7 @@ class TestTemplate(TreeEqualityTestCase):
self.assertIsInstance(node12.params[1].value.get(1), HTMLEntity)
self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|\nh = i}}", node13)
self.assertEqual("{{a\n|b =c\n|d = e|f =g\n|h = i\n|j =k\n}}", node14)
self.assertEqual("{{a|b = c\n|\nd = e|\nf =g |h =i}}", node15)
self.assertEqual("{{a|b = c\n|\nd = e|\nf =g |\nh = i}}", node15)
self.assertEqual("{{a|\nb = c|\nd = e|\nf = g|h=i}}", node16)
self.assertEqual("{{a|b|c}}", node17)
self.assertEqual("{{a|b|3=c}}", node18)
@@ -439,5 +442,172 @@ class TestTemplate(TreeEqualityTestCase):
self.assertEqual("{{foo|a=b|c=d|e=f|a=|a=b}}", node26)
self.assertRaises(ValueError, node27.remove, node28.get(1))

def test_formatting(self):
    """Test realistic param manipulation with complex whitespace formatting.

    Each entry in ``tests`` pairs the original wikitext of a template with
    the unified diff (one line of context, leading header lines dropped)
    that the edits performed in the loop below are expected to produce.

    Assumes that parsing works correctly.
    """
    # NOTE(review): the fixtures are compared as exact text, so every space
    # and newline inside the literals below is significant.
    tests = [
# https://en.wikipedia.org/w/index.php?title=Lamar_County,_Georgia&oldid=792356004
("""{{Infobox U.S. county
| county = Lamar County
| state = Georgia
| seal =
| founded = 1920
| seat wl = Barnesville
| largest city wl = Barnesville
| area_total_sq_mi = 186
| area_land_sq_mi = 184
| area_water_sq_mi = 2.3
| area percentage = 1.3%
| census yr = 2010
| pop = 18317
| density_sq_mi = 100
| time zone = Eastern
| footnotes =
| web = www.lamarcountyga.com
| ex image = Lamar County Georgia Courthouse.jpg
| ex image cap = Lamar County courthouse in Barnesville
| district = 3rd
| named for = [[Lucius Quintus Cincinnatus Lamar II]]
}}""",
"""@@ -11,4 +11,4 @@
| area percentage = 1.3%
-| census yr = 2010
-| pop = 18317
+| census estimate yr = 2016
+| pop = 12345<ref>example ref</ref>
| density_sq_mi = 100"""),

# https://en.wikipedia.org/w/index.php?title=Rockdale_County,_Georgia&oldid=792359760
("""{{Infobox U.S. County|
county = Rockdale County |
state = Georgia |
seal = |
founded = October 18, 1870 |
seat wl = Conyers |
largest city wl = Conyers |
area_total_sq_mi = 132 |
area_land_sq_mi = 130 |
area_water_sq_mi = 2.3 |
area percentage = 1.7% |
census yr = 2010|
pop = 85215 |
density_sq_mi = 657 |
web = www.rockdalecounty.org
| ex image = Rockdale-county-courthouse.jpg
| ex image cap = Rockdale County Courthouse in Conyers
| district = 4th
| time zone= Eastern
}}""",
"""@@ -11,4 +11,4 @@
area percentage = 1.7% |
- census yr = 2010|
- pop = 85215 |
+ census estimate yr = 2016 |
+ pop = 12345<ref>example ref</ref> |
density_sq_mi = 657 |"""),

# https://en.wikipedia.org/w/index.php?title=Spalding_County,_Georgia&oldid=792360413
("""{{Infobox U.S. County|
| county = Spalding County |
| state = Georgia |
| seal = |
| founded = 1851 |
| seat wl = Griffin |
| largest city wl = Griffin |
| area_total_sq_mi = 200 |
| area_land_sq_mi = 196 |
| area_water_sq_mi = 3.1 |
| area percentage = 1.6% |
| census yr = 2010|
| pop = 64073 |
| density_sq_mi = 326 |
| web = www.spaldingcounty.com |
| named for = [[Thomas Spalding]]
| ex image = Spalding County Courthouse (NE corner).JPG
| ex image cap = Spalding County Courthouse in Griffin
| district = 3rd
| time zone = Eastern
}}""",
"""@@ -11,4 +11,4 @@
| area percentage = 1.6% |
-| census yr = 2010|
-| pop = 64073 |
+|
+| census estimate yr = 2016 | pop = 12345<ref>example ref</ref> |
| density_sq_mi = 326 |"""),

# https://en.wikipedia.org/w/index.php?title=Clinton_County,_Illinois&oldid=794694648
("""{{Infobox U.S. county
|county = Clinton County
|state = Illinois
| ex image = File:Clinton County Courthouse, Carlyle.jpg
| ex image cap = [[Clinton County Courthouse (Illinois)|Clinton County Courthouse]]
|seal =
|founded = 1824
|named for = [[DeWitt Clinton]]
|seat wl= Carlyle
| largest city wl = Breese
|time zone=Central
|area_total_sq_mi = 503
|area_land_sq_mi = 474
|area_water_sq_mi = 29
|area percentage = 5.8%
|census yr = 2010
|pop = 37762
|density_sq_mi = 80
|web = www.clintonco.illinois.gov
| district = 15th
}}""",
"""@@ -15,4 +15,4 @@
|area percentage = 5.8%
- |census yr = 2010
- |pop = 37762
+ |census estimate yr = 2016
+ |pop = 12345<ref>example ref</ref>
|density_sq_mi = 80"""),

# https://en.wikipedia.org/w/index.php?title=Winnebago_County,_Illinois&oldid=789193800
("""{{Infobox U.S. county |
county = Winnebago County |
state = Illinois |
seal = Winnebago County il seal.png |
named for = [[Winnebago (tribe)|Winnebago Tribe]] |
seat wl= Rockford |
largest city wl = Rockford|
area_total_sq_mi = 519 |
area_land_sq_mi = 513|
area_water_sq_mi = 5.9 |
area percentage = 1.1% |
census yr = 2010|
pop = 295266 |
density_sq_mi = 575
| web = www.wincoil.us
| founded year = 1836
| founded date = January 16
| time zone = Central
| district = 16th
| district2 = 17th
}}""",
"""@@ -11,4 +11,4 @@
area percentage = 1.1% |
- census yr = 2010|
- pop = 295266 |
+ census estimate yr = 2016|
+ pop = 12345<ref>example ref</ref> |
density_sq_mi = 575""")]

    for (original, expected) in tests:
        code = parse(original)
        # Only the first template found in the parsed text is edited.
        template = code.filter_templates()[0]
        # Same three edits for every fixture: add() "pop", add()
        # "census estimate yr" with before="pop", then remove() "census yr".
        template.add("pop", "12345<ref>example ref</ref>")
        template.add('census estimate yr', "2016", before="pop")
        template.remove("census yr")

        # splitlines(True) keeps the line endings so that joining the diff
        # lines back together reproduces the text layout.
        oldlines = original.splitlines(True)
        newlines = str(code).splitlines(True)
        difflines = unified_diff(oldlines, newlines, n=1)
        # Skip the first two lines the diff yields (the "---"/"+++" file
        # header lines) and trim surrounding whitespace before comparing.
        diff = "".join(list(difflines)[2:]).strip()
        self.assertEqual(expected, diff)

if __name__ == "__main__":
unittest.main(verbosity=2)

Loading…
Cancel
Save