diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py index 1efafd9..98d9434 100644 --- a/tests/_test_tokenizer.py +++ b/tests/_test_tokenizer.py @@ -23,6 +23,7 @@ from __future__ import print_function, unicode_literals from os import listdir, path +from mwparserfromhell.compat import py3k from mwparserfromhell.parser import tokens class _TestParseError(Exception): @@ -36,12 +37,14 @@ class TokenizerTestCase(object): def inner(self): actual = self.tokenizer().tokenize(data["input"]) self.assertEqual(actual, data["output"]) - inner.__name__ = funcname.encode("utf8") + if not py3k: + inner.__name__ = funcname.encode("utf8") inner.__doc__ = data["label"] return inner @classmethod def _load_tests(cls, filename, text): + counter = 1 tests = text.split("\n---\n") for test in tests: data = {"name": "", "label": "", "input": "", "output": []} @@ -55,7 +58,7 @@ class TokenizerTestCase(object): raw = line[len("input:"):].strip() if raw[0] == '"' and raw[-1] == '"': raw = raw[1:-1] - data["input"] = raw.decode("unicode_escape") + data["input"] = raw.encode("raw_unicode_escape").decode("unicode_escape") elif line.startswith("output:"): raw = line[len("output:"):].strip() data["output"] = eval(raw, vars(tokens)) @@ -74,9 +77,10 @@ class TokenizerTestCase(object): error = "Test {0} in {1} was ignored because it lacked an input or an output" print(error.format(data["name"], filename)) continue - funcname = "test_" + filename + "_" + data["name"] - meth = cls._build_test_method(funcname, data) - setattr(cls, funcname, meth) + fname = "test_{0}{1}_{2}".format(filename, counter, data["name"]) + meth = cls._build_test_method(fname, data) + setattr(cls, fname, meth) + counter += 1 @classmethod def build(cls): @@ -86,7 +90,9 @@ class TokenizerTestCase(object): if not filename.endswith(extension): continue with open(path.join(directory, filename), "r") as fp: - text = fp.read().decode("utf8") + text = fp.read() + if not py3k: + text = text.decode("utf8") cls._load_tests(filename[:0-len(extension)], text) TokenizerTestCase.build() diff --git a/tests/tokenizer/text.test b/tests/tokenizer/text.test index 8d97412..eb5b9b4 100644 --- a/tests/tokenizer/text.test +++ b/tests/tokenizer/text.test @@ -5,7 +5,14 @@ output: [Text(text="foobar")] --- -name: basic2 +name: newlines label: slightly more complex text parsing, with newlines -input: "This is a line of text.\nThis is another line of text." -output: [Text(text="This is a line of text.\nThis is another line of text.")] +input: "This is a line of text.\nThis is another line of text.\nThis is another." +output: [Text(text="This is a line of text.\nThis is another line of text.\nThis is another.")] + +--- + +name: unicode +label: ensure unicode data is handled properly +input: "Thís ís å sëñtënce with diœcritiçs." +output: [Text(text="Thís ís å sëñtënce with diœcritiçs.")]