Browse Source

Implement new search protocol in Wikicode.

tags/v0.3.3
Ben Kurtovic 10 years ago
parent
commit
c0fb7c030a
4 changed files with 193 additions and 175 deletions
  1. +17
    -9
      mwparserfromhell/smart_list.py
  2. +12
    -3
      mwparserfromhell/utils.py
  3. +161
    -160
      mwparserfromhell/wikicode.py
  4. +3
    -3
      tests/test_argument.py

+ 17
- 9
mwparserfromhell/smart_list.py View File

@@ -79,6 +79,11 @@ class SmartList(_SliceNormalizerMixIn, list):
[2, 3, 4]
>>> parent
[0, 1, 2, 3, 4]

The parent needs to keep a list of its children in order to update them,
which prevents them from being garbage-collected. If you are keeping the
parent around for a while but creating many children, it is advisable to
call :py:meth:`~._ListProxy.destroy` when you're finished with them.
"""

def __init__(self, iterable=None):
@@ -146,6 +151,11 @@ class SmartList(_SliceNormalizerMixIn, list):
self.extend(other)
return self

def _release_children(self):
copy = list(self)
for child in self._children:
child._parent = copy

@inheritdoc
def append(self, item):
head = len(self)
@@ -174,17 +184,13 @@ class SmartList(_SliceNormalizerMixIn, list):

@inheritdoc
def reverse(self):
copy = list(self)
for child in self._children:
child._parent = copy
self._release_children()
super(SmartList, self).reverse()

if py3k:
@inheritdoc
def sort(self, key=None, reverse=None):
copy = list(self)
for child in self._children:
child._parent = copy
self._release_children()
kwargs = {}
if key is not None:
kwargs["key"] = key
@@ -194,9 +200,7 @@ class SmartList(_SliceNormalizerMixIn, list):
else:
@inheritdoc
def sort(self, cmp=None, key=None, reverse=None):
copy = list(self)
for child in self._children:
child._parent = copy
self._release_children()
kwargs = {}
if cmp is not None:
kwargs["cmp"] = cmp
@@ -448,5 +452,9 @@ class _ListProxy(_SliceNormalizerMixIn, list):
item.sort(**kwargs)
self._parent[self._start:self._stop:self._step] = item

def destroy(self):
"""Make the parent forget this child. The child will no longer work."""
self._parent._children.pop(id(self))


del inheritdoc

+ 12
- 3
mwparserfromhell/utils.py View File

@@ -21,8 +21,8 @@
# SOFTWARE.

"""
This module contains accessory functions that wrap around existing ones to
provide additional functionality.
This module contains accessory functions for other parts of the library. Parser
users generally won't need stuff from here.
"""

from __future__ import unicode_literals
@@ -31,7 +31,16 @@ from .compat import bytes, str
from .nodes import Node
from .smart_list import SmartList

__all__ = ["parse_anything"]
__all__ = ["get_children", "parse_anything"]

def get_children(node, contexts=False, parent=None):
"""Iterate over all child :py:class:`.Node`\ s of a given *node*."""
## DON'T MAKE THIS RECURSIVE, USE A STACK!
yield (parent, node) if contexts else node
for code in node.__children__():
for descendant in code.nodes:
for child in get_children(descendant, contexts, code):
yield child

def parse_anything(value, context=0):
"""Return a :py:class:`~.Wikicode` for *value*, allowing multiple types.


+ 161
- 160
mwparserfromhell/wikicode.py View File

@@ -21,13 +21,14 @@
# SOFTWARE.

from __future__ import unicode_literals
from itertools import chain
import re

from .compat import py3k, str
from .compat import py3k, range, str
from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity,
Node, Tag, Template, Text, Wikilink)
from .string_mixin import StringMixIn
from .utils import parse_anything
from .utils import get_children, parse_anything

__all__ = ["Wikicode"]

@@ -51,107 +52,86 @@ class Wikicode(StringMixIn):
def __unicode__(self):
return "".join([str(node) for node in self.nodes])

def _get_children(self, node):
"""Iterate over all descendants of a given *node*, including itself.

This is implemented by the ``__iternodes__()`` generator of ``Node``
classes, which by default yields itself and nothing more.
"""
for context, child in node.__iternodes__(self._get_all_nodes):
yield child

def _get_all_nodes(self, code):
"""Iterate over all of our descendant nodes.

This is implemented by calling :py:meth:`_get_children` on every node
in our node list (:py:attr:`self.nodes <nodes>`).
"""
for node in code.nodes:
for child in self._get_children(node):
yield child

def _is_equivalent(self, obj, node):
"""Return ``True`` if *obj* and *node* are equivalent, else ``False``.

If *obj* is a ``Node``, the function will test whether they are the
same object, otherwise it will compare them with ``==``.
"""
return (node is obj) if isinstance(obj, Node) else (node == obj)

def _contains(self, nodes, obj):
"""Return ``True`` if *obj* is inside of *nodes*, else ``False``.

If *obj* is a ``Node``, we will only return ``True`` if *obj* is
actually in the list (and not just a node that equals it). Otherwise,
the test is simply ``obj in nodes``.
@staticmethod
def _slice_replace(code, index, old, new):
"""Replace the string *old* with *new* across *index* in *code*."""
nodes = [str(node) for node in code.get(index)]
substring = "".join(nodes).replace(old, new)
code.nodes[index] = parse_anything(substring).nodes

def _do_strong_search(self, obj, recursive=True):
"""Search for the specific element *obj* within the node list.

*obj* can be either a :py:class:`.Node` or a :py:class:`.Wikicode`
object. If found, we return a tuple (*context*, *index*) where
*context* is the :py:class:`.Wikicode` that contains *obj* and *index*
is its index there, as a :py:class:`slice`. Note that if *recursive* is
``False``, *context* will always be ``self`` (since we only look for
*obj* among immediate descendants), but if *recursive* is ``True``,
then it could be any :py:class:`.Wikicode` contained by a node within
``self``. If *obj* is not found, :py:exc:`ValueError` is raised.
"""
mkslice = lambda i: slice(i, i + 1)
if isinstance(obj, Node):
for node in nodes:
if node is obj:
return True
return False
return obj in nodes

def _prepare_search(self, obj):
"""Prepare a new search by calculating the exact parameters.
if not recursive:
return self, mkslice(self.index(obj))
for i, node in enumerate(self.nodes):
for context, child in get_children(node, contexts=True):
if obj is child:
if not context:
context = self
return context, mkslice(context.index(child))
else:
context, ind = self._do_strong_search(obj.get(0), recursive)
for i in range(1, len(obj.nodes)):
if obj.get(i) is not context.get(ind.start + i):
break
else:
return context, slice(ind.start, ind.start + len(obj.nodes))
raise ValueError(obj)

*obj*, which may be anything passable to :py:func:`.parse_anything`, is
converted to either a single :py:class:`.Node` or a
:py:class:`.Wikicode` of multiple nodes. *literal* is a boolean;
``True`` if we are searching for an exact match with ``is`` or
``False`` if we are searching for equality with ``==``.
def _do_weak_search(self, obj, recursive):
"""Search for an element that looks like *obj* within the node list.

This follows the same rules as :py:meth:`_do_strong_search` with some
differences. *obj* is treated as a string that might represent any
:py:class:`.Node`, :py:class:`.Wikicode`, or combination of the two
present in the node list. Thus, matching is weak (using string
comparisons) rather than strong (using ``is``). Because multiple nodes
can match *obj*, the result is a list of tuples instead of just one
(however, :py:exc:`ValueError` is still raised if nothing is found).
Individual matches will never overlap.

The tuples contain a new first element, *exact*, which is ``True`` if
we were able to match *obj* exactly to one or more adjacent nodes, or
``False`` if we found *obj* inside a node or incompletely spanning
multiple nodes.
"""
literal = isinstance(obj, (Node, Wikicode))
obj = parse_anything(obj)
if not obj or obj not in self:
raise ValueError(obj)
if len(obj.nodes) == 1:
obj = obj.get(0)
return obj, literal

def _do_search(self, obj, recursive, context=None, literal=None):
"""Return some info about the location of *obj* within *context*.

If *recursive* is ``True``, we'll look within *context* (``self`` by
default) and its descendants, otherwise just *context*. We raise
:py:exc:`ValueError` if *obj* isn't found. The return data is a list of
3-tuples (*type*, *context*, *data*) where *type* is *obj*\ 's best
type resolution (either ``Node``, ``Wikicode``, or ``str``), *context*
is the closest ``Wikicode`` encompassing it, and *data* is either a
``Node``, a list of ``Node``\ s, or ``None`` depending on *type*.
"""
if not context:
context = self
obj, literal = self._prepare_search(obj)
compare = (lambda a, b: a is b) if literal else (lambda a, b: a == b)
results = []
i = 0
while i < len(context.nodes):
node = context.get(i)
if isinstance(obj, Node) and compare(obj, node):
results.append((Node, context, node))
elif isinstance(obj, Wikicode) and compare(obj.get(0), node):
for j in range(1, len(obj.nodes)):
if not compare(obj.get(j), context.get(i + j)):
break
else:
nodes = list(context.nodes[i:i + len(obj.nodes)])
results.append((Wikicode, context, nodes))
i += len(obj.nodes) - 1
elif recursive and not isinstance(node, Text) and obj in node:
contexts = node.__iternodes__(self._get_all_nodes)
processed = []
for code in (ctx for ctx, child in contexts):
if code and code not in processed and obj in code:
search = self._do_search(obj, recursive, code, literal)
results.extend(search)
processed.append(code)
i += 1

if not results and not literal and recursive:
results.append((str, context, None))
if not results and context is self:
raise ValueError(obj)
contexts = [self]
while contexts:
context = contexts.pop()
i = len(context.nodes) - 1
while i >= 0:
node = context.get(i)
if obj.get(-1) == node:
for j in range(-len(obj.nodes), -1):
if obj.get(j) != context.get(i + j + 1):
break
else:
i -= len(obj.nodes) - 1
index = slice(i, i + len(obj.nodes))
results.append((True, context, index))
elif recursive and obj in node:
contexts.extend(node.__children__())
i -= 1
if not results:
if not recursive:
raise ValueError(obj)
results.append((False, self, slice(0, len(self.nodes))))
return results

def _get_tree(self, code, lines, marker, indent):
@@ -256,15 +236,15 @@ class Wikicode(StringMixIn):
return the index of our direct descendant node within *our* list of
nodes. Otherwise, the lookup is done only on direct descendants.
"""
if recursive:
for i, node in enumerate(self.nodes):
if self._contains(self._get_children(node), obj):
return i
raise ValueError(obj)

strict = isinstance(obj, Node)
equivalent = (lambda o, n: o is n) if strict else (lambda o, n: o == n)
for i, node in enumerate(self.nodes):
if self._is_equivalent(obj, node):
return i
if recursive:
for child in get_children(node):
if equivalent(obj, child):
return i
elif equivalent(obj, node):
return i
raise ValueError(obj)

def insert(self, index, value):
@@ -279,66 +259,79 @@ class Wikicode(StringMixIn):
self.nodes.insert(index, node)

def insert_before(self, obj, value, recursive=True):
"""Insert *value* immediately before *obj* in the list of nodes.
"""Insert *value* immediately before *obj*.

*obj* can be either a string, a :py:class:`~.Node`, or other
*obj* can be either a string, a :py:class:`~.Node`, or another
:py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`,
for example). *value* can be anything parasable by
:py:func:`.parse_anything`. If *recursive* is ``True``, we will try to
find *obj* within our child nodes even if it is not a direct descendant
of this :py:class:`~.Wikicode` object. If *obj* is not found,
for example). If *obj* is a string, we will operate on all instances
of that string within the code, otherwise only on the specific instance
given. *value* can be anything parasable by :py:func:`.parse_anything`.
If *recursive* is ``True``, we will try to find *obj* within our child
nodes even if it is not a direct descendant of this
:py:class:`~.Wikicode` object. If *obj* is not found,
:py:exc:`ValueError` is raised.
"""
for restype, context, data in self._do_search(obj, recursive):
if restype in (Node, Wikicode):
i = context.index(data if restype is Node else data[0], False)
context.insert(i, value)
else:
obj = str(obj)
context.nodes = str(context).replace(obj, str(value) + obj)
if isinstance(obj, (Node, Wikicode)):
context, index = self._do_strong_search(obj, recursive)
context.insert(index.start, value)
else:
for exact, context, index in self._do_weak_search(obj, recursive):
if exact:
context.insert(index.start, value)
else:
obj = str(obj)
self._slice_replace(context, index, obj, str(value) + obj)

def insert_after(self, obj, value, recursive=True):
"""Insert *value* immediately after *obj* in the list of nodes.
"""Insert *value* immediately after *obj*.

*obj* can be either a string, a :py:class:`~.Node`, or other
*obj* can be either a string, a :py:class:`~.Node`, or another
:py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`,
for example). *value* can be anything parasable by
:py:func:`.parse_anything`. If *recursive* is ``True``, we will try to
find *obj* within our child nodes even if it is not a direct descendant
of this :py:class:`~.Wikicode` object. If *obj* is not found,
for example). If *obj* is a string, we will operate on all instances
of that string within the code, otherwise only on the specific instance
given. *value* can be anything parasable by :py:func:`.parse_anything`.
If *recursive* is ``True``, we will try to find *obj* within our child
nodes even if it is not a direct descendant of this
:py:class:`~.Wikicode` object. If *obj* is not found,
:py:exc:`ValueError` is raised.
"""
for restype, context, data in self._do_search(obj, recursive):
if restype in (Node, Wikicode):
i = context.index(data if restype is Node else data[-1], False)
context.insert(i + 1, value)
else:
obj = str(obj)
context.nodes = str(context).replace(obj, obj + str(value))
if isinstance(obj, (Node, Wikicode)):
context, index = self._do_strong_search(obj, recursive)
context.insert(index.stop, value)
else:
for exact, context, index in self._do_weak_search(obj, recursive):
if exact:
context.insert(index.stop, value)
else:
obj = str(obj)
self._slice_replace(context, index, obj, obj + str(value))

def replace(self, obj, value, recursive=True):
"""Replace *obj* with *value* in the list of nodes.
"""Replace *obj* with *value*.

*obj* can be either a string, a :py:class:`~.Node`, or other
*obj* can be either a string, a :py:class:`~.Node`, or another
:py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`,
for example). *value* can be anything parasable by
:py:func:`.parse_anything`. If *recursive* is ``True``, we will try to
find *obj* within our child nodes even if it is not a direct descendant
of this :py:class:`~.Wikicode` object. If *obj* is not found,
for example). If *obj* is a string, we will operate on all instances
of that string within the code, otherwise only on the specific instance
given. *value* can be anything parasable by :py:func:`.parse_anything`.
If *recursive* is ``True``, we will try to find *obj* within our child
nodes even if it is not a direct descendant of this
:py:class:`~.Wikicode` object. If *obj* is not found,
:py:exc:`ValueError` is raised.
"""
for restype, context, data in self._do_search(obj, recursive):
if restype is Node:
i = context.index(data, False)
context.nodes.pop(i)
context.insert(i, value)
elif restype is Wikicode:
i = context.index(data[0], False)
for _ in data:
context.nodes.pop(i)
context.insert(i, value)
else:
context.nodes = str(context).replace(str(obj), str(value))
if isinstance(obj, (Node, Wikicode)):
context, index = self._do_strong_search(obj, recursive)
for i in range(index.start, index.stop):
context.nodes.pop(index.start)
context.insert(index.start, value)
else:
for exact, context, index in self._do_weak_search(obj, recursive):
if exact:
for i in range(index.start, index.stop):
context.nodes.pop(index.start)
context.insert(index.start, value)
else:
self._slice_replace(context, index, str(obj), str(value))

def append(self, value):
"""Insert *value* at the end of the list of nodes.
@@ -352,22 +345,26 @@ class Wikicode(StringMixIn):
def remove(self, obj, recursive=True):
"""Remove *obj* from the list of nodes.

*obj* can be either a string, a :py:class:`~.Node`, or other
*obj* can be either a string, a :py:class:`~.Node`, or another
:py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`,
for example). If *recursive* is ``True``, we will try to find *obj*
within our child nodes even if it is not a direct descendant of this
for example). If *obj* is a string, we will operate on all instances
of that string within the code, otherwise only on the specific instance
given. If *recursive* is ``True``, we will try to find *obj* within our
child nodes even if it is not a direct descendant of this
:py:class:`~.Wikicode` object. If *obj* is not found,
:py:exc:`ValueError` is raised.
"""
for restype, context, data in self._do_search(obj, recursive):
if restype is Node:
context.nodes.pop(context.index(data, False))
elif restype is Wikicode:
i = context.index(data[0], False)
for _ in data:
context.nodes.pop(i)
else:
context.nodes = str(context).replace(str(obj), "")
if isinstance(obj, (Node, Wikicode)):
context, index = self._do_strong_search(obj, recursive)
for i in range(index.start, index.stop):
context.nodes.pop(index.start)
else:
for exact, context, index in self._do_weak_search(obj, recursive):
if exact:
for i in range(index.start, index.stop):
context.nodes.pop(index.start)
else:
self._slice_replace(context, index, str(obj), "")

def matches(self, other):
"""Do a loose equivalency test suitable for comparing page names.
@@ -407,7 +404,11 @@ class Wikicode(StringMixIn):
"""
if matches and not callable(matches):
pat, matches = matches, lambda obj: re.search(pat, str(obj), flags)
for node in (self._get_all_nodes(self) if recursive else self.nodes):
if recursive:
nodes = chain.from_iterable(get_children(n) for n in self.nodes)
else:
nodes = self.nodes
for node in nodes:
if not forcetype or isinstance(node, forcetype):
if not matches or matches(node):
yield node


+ 3
- 3
tests/test_argument.py View File

@@ -44,9 +44,9 @@ class TestArgument(TreeEqualityTestCase):
node2 = Argument(wraptext("foo"), wrap([Text("bar"), Text("baz")]))
gen1 = node1.__children__()
gen2 = node2.__children__()
self.assertIs(node1.name, gen1)
self.assertIs(node2.name, gen2)
self.assertIs(node2.default, gen2)
self.assertIs(node1.name, next(gen1))
self.assertIs(node2.name, next(gen2))
self.assertIs(node2.default, next(gen2))
self.assertRaises(StopIteration, next, gen1)
self.assertRaises(StopIteration, next, gen2)



Loading…
Cancel
Save