From 4636fbeb4a46e76b5d04a9c439758ed042eea7eb Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 3 Feb 2013 02:10:36 -0500
Subject: [PATCH] Built an infrastructure for loading and running tokenizer
 tests.

---
 tests/_test_tokenizer.py  | 74 +++++++++++++++++++++++++++++++++++++++++++----
 tests/test_ctokenizer.py  |  4 +--
 tests/test_pytokenizer.py |  4 +--
 tests/tokenizer/text.test | 11 +++++++
 4 files changed, 84 insertions(+), 9 deletions(-)
 create mode 100644 tests/tokenizer/text.test

diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py
index 29f4e37..1efafd9 100644
--- a/tests/_test_tokenizer.py
+++ b/tests/_test_tokenizer.py
@@ -20,9 +20,73 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-class TestTokenizer():
-    def tokenize(self, text):
-        return self.tokenizer().tokenize(text)
+from __future__ import print_function, unicode_literals
+from os import listdir, path
 
-    def test_basic(self):
-        self.assertEqual(1, 1)
+from mwparserfromhell.parser import tokens
+
+class _TestParseError(Exception):
+    """Raised internally when a test could not be parsed."""
+    pass
+
+
+class TokenizerTestCase(object):
+    @classmethod
+    def _build_test_method(cls, funcname, data):
+        def inner(self):
+            actual = self.tokenizer().tokenize(data["input"])
+            self.assertEqual(actual, data["output"])
+        inner.__name__ = funcname.encode("utf8")
+        inner.__doc__ = data["label"]
+        return inner
+
+    @classmethod
+    def _load_tests(cls, filename, text):
+        tests = text.split("\n---\n")
+        for test in tests:
+            data = {"name": "", "label": "", "input": "", "output": []}
+            try:
+                for line in test.strip().splitlines():
+                    if line.startswith("name:"):
+                        data["name"] = line[len("name:"):].strip()
+                    elif line.startswith("label:"):
+                        data["label"] = line[len("label:"):].strip()
+                    elif line.startswith("input:"):
+                        raw = line[len("input:"):].strip()
+                        if raw[0] == '"' and raw[-1] == '"':
+                            raw = raw[1:-1]
+                        data["input"] = raw.decode("unicode_escape")
+                    elif line.startswith("output:"):
+                        raw = line[len("output:"):].strip()
+                        data["output"] = eval(raw, vars(tokens))
+            except _TestParseError:
+                if data["name"]:
+                    error = "Could not parse test {0} in {1}"
+                    print(error.format(data["name"], filename))
+                else:
+                    print("Could not parse a test in {0}".format(filename))
+                continue
+            if not data["name"]:
+                error = "A test in {0} was ignored because it lacked a name"
+                print(error.format(filename))
+                continue
+            if not data["input"] or not data["output"]:
+                error = "Test {0} in {1} was ignored because it lacked an input or an output"
+                print(error.format(data["name"], filename))
+                continue
+            funcname = "test_" + filename + "_" + data["name"]
+            meth = cls._build_test_method(funcname, data)
+            setattr(cls, funcname, meth)
+
+    @classmethod
+    def build(cls):
+        directory = path.join(path.dirname(__file__), "tokenizer")
+        extension = ".test"
+        for filename in listdir(directory):
+            if not filename.endswith(extension):
+                continue
+            with open(path.join(directory, filename), "r") as fp:
+                text = fp.read().decode("utf8")
+                cls._load_tests(filename[:0-len(extension)], text)
+
+TokenizerTestCase.build()
diff --git a/tests/test_ctokenizer.py b/tests/test_ctokenizer.py
index e5a7aef..7d3ffd7 100644
--- a/tests/test_ctokenizer.py
+++ b/tests/test_ctokenizer.py
@@ -22,9 +22,9 @@
 
 import unittest
 
-from _test_tokenizer import TestTokenizer
+from _test_tokenizer import TokenizerTestCase
 
-class TestCTokenizer(unittest.TestCase, TestTokenizer):
+class TestCTokenizer(TokenizerTestCase, unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         from mwparserfromhell.parser._tokenizer import CTokenizer
diff --git a/tests/test_pytokenizer.py b/tests/test_pytokenizer.py
index 01855f7..f739726 100644
--- a/tests/test_pytokenizer.py
+++ b/tests/test_pytokenizer.py
@@ -22,9 +22,9 @@
 
 import unittest
 
-from _test_tokenizer import TestTokenizer
+from _test_tokenizer import TokenizerTestCase
 
-class TestPyTokenizer(unittest.TestCase, TestTokenizer):
+class TestPyTokenizer(TokenizerTestCase, unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         from mwparserfromhell.parser.tokenizer import Tokenizer
diff --git a/tests/tokenizer/text.test b/tests/tokenizer/text.test
new file mode 100644
index 0000000..8d97412
--- /dev/null
+++ b/tests/tokenizer/text.test
@@ -0,0 +1,11 @@
+name: basic
+label: sanity check for basic text parsing, no gimmicks
+input: "foobar"
+output: [Text(text="foobar")]
+
+---
+
+name: basic2
+label: slightly more complex text parsing, with newlines
+input: "This is a line of text.\nThis is another line of text."
+output: [Text(text="This is a line of text.\nThis is another line of text.")]
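
Note on the .test format (a sketch for illustration, not part of the patch):
the fields map one-to-one onto _load_tests() above. "input" is a quoted string
that is unescaped with unicode_escape, and "output" is evaluated in the
namespace of mwparserfromhell.parser.tokens. A hypothetical third case for
text.test, using only the Text token the file already exercises, would read:

    name: whitespace
    label: hypothetical case; escaped tabs survive unicode_escape decoding
    input: "foo\tbar"
    output: [Text(text="foo\tbar")]

Since generated methods are named "test_" + filename + "_" + name (with the
.test extension stripped), this case would surface as test_text_whitespace on
both TestPyTokenizer and TestCTokenizer.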
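
Because TokenizerTestCase.build() runs when _test_tokenizer is imported, the
generated methods exist on the mixin before unittest instantiates anything. A
minimal sketch of exercising them, assuming it is run from the tests/
directory so the module imports resolve:

    import unittest

    from test_pytokenizer import TestPyTokenizer  # importing triggers build()

    # Methods generated from tokenizer/text.test are ordinary attributes on
    # the class, so unittest picks them up like hand-written tests:
    assert hasattr(TestPyTokenizer, "test_text_basic")
    assert hasattr(TestPyTokenizer, "test_text_basic2")

    unittest.main()

The same tests can also be run with "python -m unittest test_pytokenizer" or
"python -m unittest test_ctokenizer" from that directory.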