Browse Source

Fully fix #74. Add another tokenizer test.

tags/v0.4
Ben Kurtovic 10 years ago
parent
commit
02eff0fc49
4 changed files with 32 additions and 12 deletions
  1. mwparserfromhell/compat.py (+0, -2)
  2. mwparserfromhell/parser/tokenizer.c (+13, -5)
  3. mwparserfromhell/parser/tokenizer.py (+12, -5)
  4. tests/tokenizer/integration.mwtest (+7, -0)

+ 0
- 2
mwparserfromhell/compat.py View File

@@ -20,7 +20,6 @@ if py3k:
     range = range
     maxsize = sys.maxsize
     import html.entities as htmlentities
-    zip = zip

 else:
     bytes = str
@@ -28,6 +27,5 @@ else:
     range = xrange
     maxsize = sys.maxint
     import htmlentitydefs as htmlentities
-    from itertools import izip as zip

 del sys

+ 13
- 5
mwparserfromhell/parser/tokenizer.c View File

@@ -1896,18 +1896,26 @@ static PyObject* Tokenizer_handle_single_tag_end(Tokenizer* self)
 {
     PyObject *token = 0, *padding, *kwargs;
     Py_ssize_t len, index;
-    int is_instance;
+    int depth = 1, is_instance;

     len = PyList_GET_SIZE(self->topstack->stack);
-    for (index = len - 1; index >= 0; index--) {
+    for (index = 2; index < len; index++) {
         token = PyList_GET_ITEM(self->topstack->stack, index);
-        is_instance = PyObject_IsInstance(token, TagCloseOpen);
+        is_instance = PyObject_IsInstance(token, TagOpenOpen);
         if (is_instance == -1)
             return NULL;
         else if (is_instance == 1)
-            break;
+            depth++;
+        is_instance = PyObject_IsInstance(token, TagCloseOpen);
+        if (is_instance == -1)
+            return NULL;
+        else if (is_instance == 1) {
+            depth--;
+            if (depth == 0)
+                break;
+        }
     }
-    if (!token)
+    if (!token || depth > 0)
         return NULL;
     padding = PyObject_GetAttrString(token, "padding");
     if (!padding)


+ 12
- 5
mwparserfromhell/parser/tokenizer.py View File

@@ -25,7 +25,7 @@ from math import log
 import re

 from . import contexts, tokens
-from ..compat import htmlentities, range, zip
+from ..compat import htmlentities, range
 from ..definitions import (get_html_tag, is_parsable, is_single,
                            is_single_only, is_scheme)

@@ -752,11 +752,18 @@ class Tokenizer(object):
     def _handle_single_tag_end(self):
         """Handle the stream end when inside a single-supporting HTML tag."""
         stack = self._stack
-        gen = zip(range(len(stack) - 1, -1, -1), reversed(stack))
-        index = next(i for i, t in gen if isinstance(t, tokens.TagCloseOpen))
+        # We need to find the index of the TagCloseOpen token corresponding to
+        # the TagOpenOpen token located at index 0:
+        depth = 1
+        for index, token in enumerate(stack[2:], 2):
+            if isinstance(token, tokens.TagOpenOpen):
+                depth += 1
+            elif isinstance(token, tokens.TagCloseOpen):
+                depth -= 1
+                if depth == 0:
+                    break
         padding = stack[index].padding
-        token = tokens.TagCloseSelfclose(padding=padding, implicit=True)
-        stack[index] = token
+        stack[index] = tokens.TagCloseSelfclose(padding=padding, implicit=True)
         return self._pop()

     def _really_parse_tag(self):


+ 7
- 0
tests/tokenizer/integration.mwtest View File

@@ -178,3 +178,10 @@ name: external_link_inside_wikilink_title
 label: an external link inside a wikilink title, which is invalid
 input: "[[File:Example.png http://example.com]]"
 output: [WikilinkOpen(), Text(text="File:Example.png http://example.com"), WikilinkClose()]
+
+---
+
+name: italics_inside_external_link_inside_incomplete_list
+label: italic text inside an external link inside an incomplete list
+input: "<li>[http://www.example.com ''example'']"
+output: [TagOpenOpen(), Text(text="li"), TagCloseSelfclose(padding="", implicit=True), ExternalLinkOpen(brackets=True), Text(text="http://www.example.com"), ExternalLinkSeparator(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="example"), TagOpenClose(), Text(text="i"), TagCloseClose(), ExternalLinkClose()]

Loading…
Cancel
Save