diff --git a/CHANGELOG b/CHANGELOG index 6be48c6..564b09c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,6 @@ v0.4 (unreleased): -- +- Added a script to test for memory leaks in scripts/memtest.py. v0.3.3 (released April 22, 2014): diff --git a/docs/changelog.rst b/docs/changelog.rst index 3f2ba0e..5a59be0 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -7,7 +7,7 @@ v0.4 Unreleased (`changes `__): -- +- Added a script to test for memory leaks in :file:`scripts/memtest.py`. v0.3.3 ------ diff --git a/scripts/memtest.py b/scripts/memtest.py new file mode 100644 index 0000000..e6b8011 --- /dev/null +++ b/scripts/memtest.py @@ -0,0 +1,170 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012-2014 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +""" +Tests for memory leaks in the CTokenizer. Python 2 and 3 compatible. + +This appears to work mostly fine under Linux, but gives an absurd number of +false positives on OS X. I'm not sure why. Running the tests multiple times +yields different results (tests don't always leak, and the amount they leak by +varies). Increasing the number of loops results in a smaller bytes/loop value, +too, indicating the increase in memory usage might be due to something else. +Actual memory leaks typically leak very large amounts of memory (megabytes) +and scale with the number of loops. +""" + +from __future__ import unicode_literals, print_function +from locale import LC_ALL, setlocale +from multiprocessing import Process, Pipe +from os import listdir, path +import sys + +import psutil + +from mwparserfromhell.compat import py3k +from mwparserfromhell.parser._tokenizer import CTokenizer + +if sys.version_info[0] == 2: + range = xrange + +LOOPS = 10000 + +class Color(object): + GRAY = "\x1b[30;1m" + GREEN = "\x1b[92m" + YELLOW = "\x1b[93m" + RESET = "\x1b[0m" + + +class MemoryTest(object): + """Manages a memory test.""" + + def __init__(self): + self._tests = [] + self._load() + + def _parse_file(self, name, text): + tests = text.split("\n---\n") + counter = 1 + digits = len(str(len(tests))) + for test in tests: + data = {"name": None, "label": None, "input": None, "output": None} + for line in test.strip().splitlines(): + if line.startswith("name:"): + data["name"] = line[len("name:"):].strip() + elif line.startswith("label:"): + data["label"] = line[len("label:"):].strip() + elif line.startswith("input:"): + raw = line[len("input:"):].strip() + if raw[0] == '"' and raw[-1] == '"': + raw = raw[1:-1] + raw = raw.encode("raw_unicode_escape") + data["input"] = raw.decode("unicode_escape") + number = str(counter).zfill(digits) + fname = "test_{0}{1}_{2}".format(name, number, data["name"]) + self._tests.append((fname, data["input"])) + counter += 1 + + def _load(self): + def load_file(filename): + with open(filename, "rU") as fp: + text = fp.read() + if not py3k: + text = text.decode("utf8") + name = path.split(filename)[1][:0-len(extension)] + self._parse_file(name, text) + + root = path.split(path.dirname(path.abspath(__file__)))[0] + directory = path.join(root, "tests", "tokenizer") + extension = ".mwtest" + if len(sys.argv) > 2 and sys.argv[1] == "--use": + for name in sys.argv[2:]: + load_file(path.join(directory, name + extension)) + sys.argv = [sys.argv[0]] # So unittest doesn't try to load these + else: + for filename in listdir(directory): + if not filename.endswith(extension): + continue + load_file(path.join(directory, filename)) + + @staticmethod + def _print_results(info1, info2): + r1, r2 = info1.rss, info2.rss + buff = 8192 + if r2 - buff > r1: + d = r2 - r1 + p = float(d) / r1 + bpt = d // LOOPS + tmpl = "{0}LEAKING{1}: {2:n} bytes, {3:.2%} inc ({4:n} bytes/loop)" + sys.stdout.write(tmpl.format(Color.YELLOW, Color.RESET, d, p, bpt)) + else: + sys.stdout.write("{0}OK{1}".format(Color.GREEN, Color.RESET)) + + def run(self): + """Run the memory test suite.""" + width = 1 + for (name, _) in self._tests: + if len(name) > width: + width = len(name) + + tmpl = "{0}[{1:03}/{2}]{3} {4}: " + for i, (name, text) in enumerate(self._tests, 1): + sys.stdout.write(tmpl.format(Color.GRAY, i, len(self._tests), + Color.RESET, name.ljust(width))) + sys.stdout.flush() + parent, child = Pipe() + p = Process(target=_runner, args=(text, child)) + p.start() + try: + proc = psutil.Process(p.pid) + parent.recv() + parent.send("OK") + parent.recv() + info1 = proc.get_memory_info() + sys.stdout.flush() + parent.send("OK") + parent.recv() + info2 = proc.get_memory_info() + self._print_results(info1, info2) + sys.stdout.flush() + parent.send("OK") + finally: + proc.kill() + print() + + +def _runner(text, child): + r1, r2 = range(250), range(LOOPS) + for i in r1: + CTokenizer().tokenize(text) + child.send("OK") + child.recv() + child.send("OK") + child.recv() + for i in r2: + CTokenizer().tokenize(text) + child.send("OK") + child.recv() + +if __name__ == "__main__": + setlocale(LC_ALL, "") + MemoryTest().run()