From 4636fbeb4a46e76b5d04a9c439758ed042eea7eb Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 3 Feb 2013 02:10:36 -0500
Subject: [PATCH] Built an infrastructure for loading and running tokenizer
 tests.

---
 tests/_test_tokenizer.py  | 74 +++++++++++++++++++++++++++++++++++++++++++----
 tests/test_ctokenizer.py  |  4 +--
 tests/test_pytokenizer.py |  4 +--
 tests/tokenizer/text.test | 11 +++++++
 4 files changed, 84 insertions(+), 9 deletions(-)
 create mode 100644 tests/tokenizer/text.test

diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py
index 29f4e37..1efafd9 100644
--- a/tests/_test_tokenizer.py
+++ b/tests/_test_tokenizer.py
@@ -20,9 +20,73 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-class TestTokenizer():
-    def tokenize(self, text):
-        return self.tokenizer().tokenize(text)
+from __future__ import print_function, unicode_literals
+from os import listdir, path
 
-    def test_basic(self):
-        self.assertEqual(1, 1)
+from mwparserfromhell.parser import tokens
+
+class _TestParseError(Exception):
+    """Raised internally when a test could not be parsed."""
+    pass
+
+
+class TokenizerTestCase(object):
+    @classmethod
+    def _build_test_method(cls, funcname, data):
+        def inner(self):
+            actual = self.tokenizer().tokenize(data["input"])
+            self.assertEqual(actual, data["output"])
+        inner.__name__ = funcname.encode("utf8")
+        inner.__doc__ = data["label"]
+        return inner
+
+    @classmethod
+    def _load_tests(cls, filename, text):
+        tests = text.split("\n---\n")
+        for test in tests:
+            data = {"name": "", "label": "", "input": "", "output": []}
+            try:
+                for line in test.strip().splitlines():
+                    if line.startswith("name:"):
+                        data["name"] = line[len("name:"):].strip()
+                    elif line.startswith("label:"):
+                        data["label"] = line[len("label:"):].strip()
+                    elif line.startswith("input:"):
+                        raw = line[len("input:"):].strip()
+                        if raw[0] == '"' and raw[-1] == '"':
+                            raw = raw[1:-1]
+                        data["input"] = raw.decode("unicode_escape")
+                    elif line.startswith("output:"):
+                        raw = line[len("output:"):].strip()
+                        data["output"] = eval(raw, vars(tokens))
+            except _TestParseError:
+                if data["name"]:
+                    error = "Could not parse test {0} in {1}"
+                    print(error.format(data["name"], filename))
+                else:
+                    print("Could not parse a test in {0}".format(filename))
+                continue
+            if not data["name"]:
+                error = "A test in {0} was ignored because it lacked a name"
+                print(error.format(filename))
+                continue
+            if not data["input"] or not data["output"]:
+                error = "Test {0} in {1} was ignored because it lacked an input or an output"
+                print(error.format(data["name"], filename))
+                continue
+            funcname = "test_" + filename + "_" + data["name"]
+            meth = cls._build_test_method(funcname, data)
+            setattr(cls, funcname, meth)
+
+    @classmethod
+    def build(cls):
+        directory = path.join(path.dirname(__file__), "tokenizer")
+        extension = ".test"
+        for filename in listdir(directory):
+            if not filename.endswith(extension):
+                continue
+            with open(path.join(directory, filename), "r") as fp:
+                text = fp.read().decode("utf8")
+                cls._load_tests(filename[:0-len(extension)], text)
+
+TokenizerTestCase.build()
diff --git a/tests/test_ctokenizer.py b/tests/test_ctokenizer.py
index e5a7aef..7d3ffd7 100644
--- a/tests/test_ctokenizer.py
+++ b/tests/test_ctokenizer.py
@@ -22,9 +22,9 @@
 
 import unittest
 
-from _test_tokenizer import TestTokenizer
+from _test_tokenizer import TokenizerTestCase
 
-class TestCTokenizer(unittest.TestCase, TestTokenizer):
+class TestCTokenizer(TokenizerTestCase, unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         from mwparserfromhell.parser._tokenizer import CTokenizer
diff --git a/tests/test_pytokenizer.py b/tests/test_pytokenizer.py
index 01855f7..f739726 100644
--- a/tests/test_pytokenizer.py
+++ b/tests/test_pytokenizer.py
@@ -22,9 +22,9 @@
 
 import unittest
 
-from _test_tokenizer import TestTokenizer
+from _test_tokenizer import TokenizerTestCase
 
-class TestPyTokenizer(unittest.TestCase, TestTokenizer):
+class TestPyTokenizer(TokenizerTestCase, unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         from mwparserfromhell.parser.tokenizer import Tokenizer
diff --git a/tests/tokenizer/text.test b/tests/tokenizer/text.test
new file mode 100644
index 0000000..8d97412
--- /dev/null
+++ b/tests/tokenizer/text.test
@@ -0,0 +1,11 @@
+name: basic
+label: sanity check for basic text parsing, no gimmicks
+input: "foobar"
+output: [Text(text="foobar")]
+
+---
+
+name: basic2
+label: slightly more complex text parsing, with newlines
+input: "This is a line of text.\nThis is another line of text."
+output: [Text(text="This is a line of text.\nThis is another line of text.")]
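
Note on the .test format (a sketch for illustration, not part of the patch):
the fields map one-to-one onto _load_tests() above. "input" is a quoted string
that is unescaped with unicode_escape, and "output" is evaluated in the
namespace of mwparserfromhell.parser.tokens. A hypothetical third case for
text.test, using only the Text token the file already exercises, would read:

    name: whitespace
    label: hypothetical case; escaped tabs survive unicode_escape decoding
    input: "foo\tbar"
    output: [Text(text="foo\tbar")]

Since generated methods are named "test_" + filename + "_" + name (with the
.test extension stripped), this case would surface as test_text_whitespace on
both TestPyTokenizer and TestCTokenizer.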
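
Because TokenizerTestCase.build() runs when _test_tokenizer is imported, the
generated methods exist on the mixin before unittest instantiates anything. A
minimal sketch of exercising them, assuming it is run from the tests/
directory so the module imports resolve:

    import unittest

    from test_pytokenizer import TestPyTokenizer  # importing triggers build()

    # Methods generated from tokenizer/text.test are ordinary attributes on
    # the class, so unittest picks them up like hand-written tests:
    assert hasattr(TestPyTokenizer, "test_text_basic")
    assert hasattr(TestPyTokenizer, "test_text_basic2")

    unittest.main()

The same tests can also be run with "python -m unittest test_pytokenizer" or
"python -m unittest test_ctokenizer" from that directory.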