
Built an infrastructure for loading and running tokenizer tests.

commit 4636fbeb4a (tags/v0.2)
Ben Kurtovic, 11 years ago
4 changed files with 84 additions and 9 deletions
  1. tests/_test_tokenizer.py    +69  -5
  2. tests/test_ctokenizer.py     +2  -2
  3. tests/test_pytokenizer.py    +2  -2
  4. tests/tokenizer/text.test   +11  -0

tests/_test_tokenizer.py (+69, -5)

@@ -20,9 +20,73 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-class TestTokenizer():
-    def tokenize(self, text):
-        return self.tokenizer().tokenize(text)
+from __future__ import print_function, unicode_literals
+from os import listdir, path
 
-    def test_basic(self):
-        self.assertEqual(1, 1)
+from mwparserfromhell.parser import tokens
+
+
+class _TestParseError(Exception):
+    """Raised internally when a test could not be parsed."""
+    pass
+
+
+class TokenizerTestCase(object):
+    @classmethod
+    def _build_test_method(cls, funcname, data):
+        def inner(self):
+            actual = self.tokenizer().tokenize(data["input"])
+            self.assertEqual(actual, data["output"])
+        inner.__name__ = funcname.encode("utf8")
+        inner.__doc__ = data["label"]
+        return inner
+
+    @classmethod
+    def _load_tests(cls, filename, text):
+        tests = text.split("\n---\n")
+        for test in tests:
+            data = {"name": "", "label": "", "input": "", "output": []}
+            try:
+                for line in test.strip().splitlines():
+                    if line.startswith("name:"):
+                        data["name"] = line[len("name:"):].strip()
+                    elif line.startswith("label:"):
+                        data["label"] = line[len("label:"):].strip()
+                    elif line.startswith("input:"):
+                        raw = line[len("input:"):].strip()
+                        if raw[0] == '"' and raw[-1] == '"':
+                            raw = raw[1:-1]
+                        data["input"] = raw.decode("unicode_escape")
+                    elif line.startswith("output:"):
+                        raw = line[len("output:"):].strip()
+                        data["output"] = eval(raw, vars(tokens))
+            except _TestParseError:
+                if data["name"]:
+                    error = "Could not parse test {0} in {1}"
+                    print(error.format(data["name"], filename))
+                else:
+                    print("Could not parse a test in {0}".format(filename))
+                continue
+            if not data["name"]:
+                error = "A test in {0} was ignored because it lacked a name"
+                print(error.format(filename))
+                continue
+            if not data["input"] or not data["output"]:
+                error = "Test {0} in {1} was ignored because it lacked an input or an output"
+                print(error.format(data["name"], filename))
+                continue
+            funcname = "test_" + filename + "_" + data["name"]
+            meth = cls._build_test_method(funcname, data)
+            setattr(cls, funcname, meth)
+
+    @classmethod
+    def build(cls):
+        directory = path.join(path.dirname(__file__), "tokenizer")
+        extension = ".test"
+        for filename in listdir(directory):
+            if not filename.endswith(extension):
+                continue
+            with open(path.join(directory, filename), "r") as fp:
+                text = fp.read().decode("utf8")
+            cls._load_tests(filename[:0-len(extension)], text)
+
+TokenizerTestCase.build()
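
The core trick here is that TokenizerTestCase carries no test methods of its own at definition time: _load_tests() manufactures one test_* method per entry in the .test files and attaches it with setattr() when the module is imported, so unittest later discovers the generated methods on the concrete subclasses. A minimal, self-contained sketch of the same pattern (hypothetical names, Python 3, independent of this commit):

    import unittest

    class GeneratedTestCase(object):
        @classmethod
        def _build(cls, name, data):
            def inner(self):
                # Stand-in assertion; the real suite compares
                # self.tokenizer().tokenize(data["input"]) to data["output"].
                self.assertEqual(data["input"].upper(), data["output"])
            inner.__name__ = name
            setattr(cls, name, inner)

    for name, data in [("test_upper", {"input": "ab", "output": "AB"})]:
        GeneratedTestCase._build(name, data)

    class TestGenerated(GeneratedTestCase, unittest.TestCase):
        pass

    if __name__ == "__main__":
        unittest.main()  # discovers and runs the generated test_upper

The funcname.encode("utf8") in the real code exists because, under Python 2 with unicode_literals in effect, a function's __name__ must be a byte string, so the generated name is encoded before assignment.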

tests/test_ctokenizer.py (+2, -2)

@@ -22,9 +22,9 @@

 import unittest
 
-from _test_tokenizer import TestTokenizer
+from _test_tokenizer import TokenizerTestCase
 
-class TestCTokenizer(unittest.TestCase, TestTokenizer):
+class TestCTokenizer(TokenizerTestCase, unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         from mwparserfromhell.parser._tokenizer import CTokenizer
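
Two details in this two-line change are easy to miss. First, the base-class order is flipped so the mixin precedes unittest.TestCase: with TokenizerTestCase first in the MRO, its dynamically attached test_* methods are resolved ahead of anything on TestCase. Second, the tokenizer import stays inside setUpClass, so the C extension is only required when this class actually runs. The truncated body presumably stores the imported class for the generated tests to call; a hedged sketch of the likely shape (the cls.tokenizer assignment is an assumption, not shown in this diff):

    class TestCTokenizer(TokenizerTestCase, unittest.TestCase):
        @classmethod
        def setUpClass(cls):
            from mwparserfromhell.parser._tokenizer import CTokenizer
            # Assumed: generated tests call self.tokenizer().tokenize(...),
            # so the tokenizer class is exposed as a class attribute here.
            cls.tokenizer = CTokenizer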


tests/test_pytokenizer.py (+2, -2)

@@ -22,9 +22,9 @@

 import unittest
 
-from _test_tokenizer import TestTokenizer
+from _test_tokenizer import TokenizerTestCase
 
-class TestPyTokenizer(unittest.TestCase, TestTokenizer):
+class TestPyTokenizer(TokenizerTestCase, unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         from mwparserfromhell.parser.tokenizer import Tokenizer


tests/tokenizer/text.test (+11, -0)

@@ -0,0 +1,11 @@
+name: basic
+label: sanity check for basic text parsing, no gimmicks
+input: "foobar"
+output: [Text(text="foobar")]
+
+---
+
+name: basic2
+label: slightly more complex text parsing, with newlines
+input: "This is a line of text.\nThis is another line of text."
+output: [Text(text="This is a line of text.\nThis is another line of text.")]
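
Tracing one of these entries through the loader: input values are written as quoted, backslash-escaped strings, so the loader strips the quotes and applies the unicode_escape codec to turn "\n" into a real newline, while output lines are eval()'d with vars(tokens) as the namespace so names like Text resolve to token classes. A sketch of the input decoding in isolation (a Python 3 equivalent of the loader's raw.decode("unicode_escape"); the .encode("ascii") step assumes ASCII-only test inputs):

    line = 'input: "This is a line of text.\\nThis is another line of text."'
    raw = line[len("input:"):].strip()
    if raw[0] == '"' and raw[-1] == '"':
        raw = raw[1:-1]  # drop the surrounding quotes
    text = raw.encode("ascii").decode("unicode_escape")
    assert text == "This is a line of text.\nThis is another line of text."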
