lib/compression: add simple python bindings
author: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
        Fri, 25 Nov 2022 03:43:52 +0000 (16:43 +1300)
committer: Jeremy Allison <jra@samba.org>
        Thu, 22 Dec 2022 19:50:33 +0000 (19:50 +0000)
There are four functions, allowing compression and decompression in
the two formats we support so far. The functions will accept bytes or
unicode strings which are treated as utf-8.

The LZ77+Huffman decompression algorithm requires an exact target
length to decompress, so this is mandatory.

The plain decompression algorithm does not need an exact length, but
you can provide one to help it know how much space to allocate. As
currently written, you can provide a short length and it will often
succeed in decompressing to a different shorter string.

These bindings are intended to make ad-hoc investigation easier, not
for production use. This is reflected in the guesses about output size
that plain_decompress() makes if you don't supply one -- either they
are stupidly wasteful or ridiculously insufficient, depending on
whether or not you were trying to decompress a 20MB string.

>>> a = '12345678'
>>> import compression
>>> b = compression.huffman_compress(a)
>>> b
b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00  #....
>>> len(b)
262
>>> c = compression.huffman_decompress(b, len(a))
>>> c
b'12345678'                                   # note, c is bytes, a is str
>>> a
'12345678'
>>> d = compression.plain_compress(a)
>>> d
b'\xff\xff\xff\x0012345678'
>>> compression.plain_decompress(d)           # no size specified, guesses
b'12345678'
>>> compression.plain_decompress(d,5)
b'12345'
>>> compression.plain_decompress(d,0)         # 0 for auto
b'12345678'
>>> compression.plain_decompress(d,1)
b'1'
>>> compression.plain_decompress(a,444)
Traceback (most recent call last):
   compression.CompressionError: unable to decompress data into a buffer of 444 bytes.
>>> compression.plain_decompress(b,444)
b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00 #...

That last one decompresses the Huffman-compressed data with the plain
decompressor; pretty much any string is valid for plain decompression.

Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Reviewed-by: Jeremy Allison <jra@samba.org>
lib/compression/pycompression.c [new file with mode: 0644]
lib/compression/wscript_build
python/samba/tests/compression.py [new file with mode: 0644]
source4/selftest/tests.py

diff --git a/lib/compression/pycompression.c b/lib/compression/pycompression.c
new file mode 100644 (file)
index 0000000..00a2070
--- /dev/null
@@ -0,0 +1,304 @@
+/*
+   Samba Unix SMB/CIFS implementation.
+
+   Python bindings for compression functions.
+
+   Copyright (C) Petr Viktorin 2015
+   Copyright (C) Douglas Bagnall 2022
+
+     ** NOTE! The following LGPL license applies to the talloc
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include <talloc.h>
+#include <Python.h>
+#include "lzxpress.h"
+#include "lzxpress_huffman.h"
+
+/* CompressionError is filled out in module init */
+static PyObject *CompressionError = NULL;
+
+static PyObject *plain_compress(PyObject *mod, PyObject *args)
+{
+       uint8_t *src = NULL;
+       Py_ssize_t src_len;
+       char *dest = NULL;
+       Py_ssize_t dest_len;
+       PyObject *dest_obj = NULL;
+       size_t alloc_len;
+       int ret;
+
+       /* "s#" accepts bytes, or str (which arrives as UTF-8 bytes). */
+       if (!PyArg_ParseTuple(args, "s#", &src, &src_len)) {
+               return NULL;
+       }
+
+       /*
+        * 9/8 + 4 is the worst case growth, but we add room.
+        *
+        * alloc_len can't overflow as src_len is ssize_t while alloc_len is
+        * size_t.
+        */
+       alloc_len = src_len + src_len / 8 + 500;
+
+       dest_obj = PyBytes_FromStringAndSize(NULL, alloc_len);
+       if (dest_obj == NULL) {
+               return NULL;
+       }
+       dest = PyBytes_AS_STRING(dest_obj);
+
+       /* a negative return indicates compression failure */
+       dest_len = lzxpress_compress(src,
+                                    src_len,
+                                    (uint8_t *)dest,
+                                    alloc_len);
+       if (dest_len < 0) {
+               PyErr_SetString(CompressionError, "unable to compress data");
+               Py_DECREF(dest_obj);
+               return NULL;
+       }
+
+       /* shrink the bytes object down to the actual compressed size */
+       ret = _PyBytes_Resize(&dest_obj, dest_len);
+       if (ret != 0) {
+               /*
+                * Don't try to free dest_obj, as we're in deep MemoryError
+                * territory here.
+                */
+               return NULL;
+       }
+       return dest_obj;
+}
+
+
+/*
+ * Decompress LZXpress plain compressed bytes.
+ *
+ * An optional second argument gives the expected decompressed size
+ * (0 means "guess"). Without it we speculatively allocate a buffer
+ * (see the size guesses below) which may be too small for very
+ * expansive data.
+ */
+static PyObject *plain_decompress(PyObject *mod, PyObject *args)
+{
+       uint8_t *src = NULL;
+       Py_ssize_t src_len;
+       char *dest = NULL;
+       Py_ssize_t dest_len;
+       PyObject *dest_obj = NULL;
+       Py_ssize_t alloc_len = 0;
+       Py_ssize_t given_len = 0;
+       int ret;
+
+       if (!PyArg_ParseTuple(args, "s#|n", &src, &src_len, &given_len)) {
+               return NULL;
+       }
+       if (given_len != 0) {
+               /*
+                * With plain decompression, we don't *need* the exact output
+                * size (as we do with LZ77+Huffman), but it certainly helps
+                * when guessing the size.
+                */
+               alloc_len = given_len;
+       } else if (src_len > UINT32_MAX) {
+               /*
+                * The underlying decompress function will reject this, but by
+                * checking here we can give a better message and be clearer
+                * about overflow risks.
+                *
+                * Note, the limit is actually the smallest of UINT32_MAX and
+                * SSIZE_MAX, but src_len is ssize_t so it already can't
+                * exceed that.
+                */
+               PyErr_Format(CompressionError,
+                            "The maximum size for compressed data is 4GB; "
+                            "cannot decompress %zd bytes.", src_len);
+               /*
+                * Bug fix: previously this fell through with an exception
+                * set and alloc_len == 0, and could return an (empty) bytes
+                * object while the exception was pending.
+                */
+               return NULL;
+       } else {
+               /*
+                * The data can expand massively (though not beyond the
+                * 4GB limit) so we guess a big number for small inputs
+                * (we expect small inputs), and a relatively conservative
+                * number for big inputs.
+                */
+               if (src_len <= 3333333) {
+                       alloc_len = 10000000;
+               } else if (src_len >= UINT32_MAX / 3) {
+                       /*
+                        * Cap the guess at 4GB. (The old test was
+                        * "src_len / 3 >= UINT32_MAX", which is unreachable
+                        * given src_len <= UINT32_MAX, and allowed
+                        * src_len * 3 to overflow where ssize_t is 32 bit.)
+                        */
+                       alloc_len = UINT32_MAX;
+               } else {
+                       alloc_len = src_len * 3;
+               }
+       }
+
+       dest_obj = PyBytes_FromStringAndSize(NULL, alloc_len);
+       if (dest_obj == NULL) {
+               return NULL;
+       }
+       dest = PyBytes_AS_STRING(dest_obj);
+
+       /* a negative return indicates decompression failure */
+       dest_len = lzxpress_decompress(src,
+                                      src_len,
+                                      (uint8_t *)dest,
+                                      alloc_len);
+       if (dest_len < 0) {
+               if (alloc_len == given_len) {
+                       PyErr_Format(CompressionError,
+                                    "unable to decompress data into a buffer "
+                                    "of %zd bytes.", alloc_len);
+               } else {
+                       /* we guessed the size; hint at the remedy */
+                       PyErr_Format(CompressionError,
+                                    "unable to decompress data into a buffer "
+                                    "of %zd bytes. If you know the length, "
+                                    "supply it as the second argument.",
+                                    alloc_len);
+               }
+               Py_DECREF(dest_obj);
+               return NULL;
+       }
+
+       /* shrink the bytes object down to the actual decompressed size */
+       ret = _PyBytes_Resize(&dest_obj, dest_len);
+       if (ret != 0) {
+               /*
+                * Don't try to free dest_obj, as we're in deep MemoryError
+                * territory here.
+                */
+               return NULL;
+       }
+       return dest_obj;
+}
+
+
+
+static PyObject *huffman_compress(PyObject *mod, PyObject *args)
+{
+       uint8_t *src = NULL;
+       Py_ssize_t src_len;
+       char *dest = NULL;
+       Py_ssize_t dest_len;
+       PyObject *dest_obj = NULL;
+       size_t alloc_len;
+       int ret;
+       /*
+        * Stack-allocated working memory for the compressor — presumably
+        * sized by struct definition in lzxpress_huffman.h; TODO confirm
+        * it is safe at this stack depth.
+        */
+       struct lzxhuff_compressor_mem cmp_mem;
+
+       /* "s#" accepts bytes, or str (which arrives as UTF-8 bytes). */
+       if (!PyArg_ParseTuple(args, "s#", &src, &src_len)) {
+               return NULL;
+       }
+       /*
+        * worst case is roughly 256 per 64k or less.
+        *
+        * alloc_len won't overflow as src_len is ssize_t while alloc_len is
+        * size_t.
+        */
+       alloc_len = src_len + src_len / 8 + 500;
+
+       dest_obj = PyBytes_FromStringAndSize(NULL, alloc_len);
+       if (dest_obj == NULL) {
+               return NULL;
+       }
+       dest = PyBytes_AS_STRING(dest_obj);
+
+       /* a negative return indicates compression failure */
+       dest_len = lzxpress_huffman_compress(&cmp_mem,
+                                            src,
+                                            src_len,
+                                            (uint8_t *)dest,
+                                            alloc_len);
+       if (dest_len < 0) {
+               PyErr_SetString(CompressionError, "unable to compress data");
+               Py_DECREF(dest_obj);
+               return NULL;
+       }
+
+       ret = _PyBytes_Resize(&dest_obj, dest_len);
+       if (ret != 0) {
+               /*
+                * Don't try to free dest_obj, as we're in deep MemoryError
+                * territory here.
+                */
+               return NULL;
+       }
+       return dest_obj;
+}
+
+
+/*
+ * Decompress LZ77+Huffman compressed bytes.
+ *
+ * Unlike the plain format, LZ77+Huffman decompression always needs the
+ * exact decompressed length, so the second argument is mandatory
+ * (PyArg_ParseTuple "s#n", no '|').
+ */
+static PyObject *huffman_decompress(PyObject *mod, PyObject *args)
+{
+       uint8_t *src = NULL;
+       Py_ssize_t src_len;
+       char *dest = NULL;
+       Py_ssize_t dest_len;
+       PyObject *dest_obj = NULL;
+       Py_ssize_t given_len = 0;
+
+       if (!PyArg_ParseTuple(args, "s#n", &src, &src_len, &given_len)) {
+               return NULL;
+       }
+
+       dest_obj = PyBytes_FromStringAndSize(NULL, given_len);
+       if (dest_obj == NULL) {
+               return NULL;
+       }
+       dest = PyBytes_AS_STRING(dest_obj);
+
+       dest_len = lzxpress_huffman_decompress(src,
+                                              src_len,
+                                              (uint8_t *)dest,
+                                              given_len);
+       if (dest_len != given_len) {
+               /* message fixed: was the garbled "into a %zd bytes." */
+               PyErr_Format(CompressionError,
+                            "unable to decompress data into a buffer "
+                            "of %zd bytes.", given_len);
+               Py_DECREF(dest_obj);
+               return NULL;
+       }
+       /* The buffer is exactly given_len bytes, so no resize is needed. */
+       return dest_obj;
+}
+
+
+/* Method table; the huffman docstrings previously said "plain". */
+static PyMethodDef mod_methods[] = {
+       { "plain_compress", (PyCFunction)plain_compress, METH_VARARGS,
+               "compress bytes using lzxpress plain compression"},
+       { "plain_decompress", (PyCFunction)plain_decompress, METH_VARARGS,
+               "decompress lzxpress plain compressed bytes"},
+       { "huffman_compress", (PyCFunction)huffman_compress, METH_VARARGS,
+               "compress bytes using lzxpress LZ77+Huffman compression"},
+       { "huffman_decompress", (PyCFunction)huffman_decompress, METH_VARARGS,
+               "decompress lzxpress LZ77+Huffman compressed bytes"},
+       {0}
+};
+
+
+/* Module docstring (typo fixed: "compresssion"). */
+#define MODULE_DOC PyDoc_STR("LZXpress compression/decompression bindings")
+
+static struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    .m_name = "compression",
+    .m_doc = MODULE_DOC,
+    .m_size = -1,       /* module has global state (CompressionError) */
+    .m_methods = mod_methods,
+};
+
+
+/*
+ * Create the module object and its CompressionError exception.
+ *
+ * Returns a new reference to the module, or NULL (with an exception
+ * set) on failure.
+ */
+static PyObject *module_init(void)
+{
+       PyObject *m = PyModule_Create(&moduledef);
+       if (m == NULL) {
+               return NULL;
+       }
+
+       CompressionError = PyErr_NewException(
+               "compression.CompressionError",
+               PyExc_Exception,
+               NULL);
+       if (CompressionError == NULL) {
+               Py_DECREF(m);
+               return NULL;
+       }
+       /*
+        * PyModule_AddObject steals a reference on success; take an extra
+        * one so the static CompressionError pointer stays valid even if
+        * the module object is later deallocated.
+        */
+       Py_INCREF(CompressionError);
+       if (PyModule_AddObject(m, "CompressionError", CompressionError) < 0) {
+               Py_DECREF(CompressionError); /* the extra reference */
+               Py_CLEAR(CompressionError);  /* the creation reference */
+               Py_DECREF(m);
+               return NULL;
+       }
+
+       return m;
+}
+
+/* self-prototype — presumably to satisfy -Wmissing-prototypes; confirm */
+PyMODINIT_FUNC PyInit_compression(void);
+PyMODINIT_FUNC PyInit_compression(void)
+{
+       return module_init();
+}
index 1ab208cf18d951c39fd487447171fea26cc39582..61fe4a9808e66fd38d050d363faa14c54f4e5583 100644 (file)
@@ -18,3 +18,8 @@ bld.SAMBA_BINARY('test_lzxpress_plain',
                        ' samba-util'),
                  local_include=False,
                  for_selftest=True)
+
+bld.SAMBA_PYTHON('pycompression',
+                 'pycompression.c',
+                 deps='LZXPRESS',
+                 realname='samba/compression.so')
diff --git a/python/samba/tests/compression.py b/python/samba/tests/compression.py
new file mode 100644 (file)
index 0000000..48f8c87
--- /dev/null
@@ -0,0 +1,212 @@
+# Unix SMB/CIFS implementation.
+# Copyright © Catalyst
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from unittest import TestSuite
+import os
+import random
+
+from samba.tests import TestCase
+from samba import compression
+
+
+TEST_DIR = "testdata/compression"
+
+
+class BaseCompressionTest(TestCase):
+    def round_trip(self, data, size_delta=0):
+        """Compress, decompress, assert equality with original.
+
+        If size_delta is None, no size is given to decompress. This
+        should fail with the Huffman variant and succeed with plain.
+        Otherwise size_delta is added to the given size; if negative,
+        we'd expect a failure; with plain compression a positive delta
+        will succeed.
+        """
+
+        compressed = self.compress(data)
+        if size_delta is None:
+            decompressed = self.decompress(compressed)
+        else:
+            decomp_size = len(data) + size_delta
+            decompressed = self.decompress(compressed, decomp_size)
+
+        # str input is compressed as UTF-8, while decompression always
+        # returns bytes, so encode before comparing like with like.
+        if isinstance(data, str):
+            data = data.encode()
+
+        self.assertEqual(data, decompressed)
+        return compressed
+
+    def decompress_file(self, fn):
+        """Decompress a stored compressed file and compare the result
+        with its stored decompressed counterpart."""
+        decomp_fn = os.path.join(TEST_DIR,
+                                 "decompressed",
+                                 fn + ".decomp")
+        comp_fn = os.path.join(TEST_DIR,
+                               self.compressed_dir,
+                               fn + self.compressed_suffix)
+
+        with open(decomp_fn, 'rb') as f:
+            decomp_expected = f.read()
+        with open(comp_fn, 'rb') as f:
+            comp = f.read()
+
+        decompressed = self.decompress(comp, len(decomp_expected))
+
+        self.assertEqual(decomp_expected, decompressed)
+
+
+class LzxpressPlainCompressionTest(BaseCompressionTest):
+    compress = compression.plain_compress
+    decompress = compression.plain_decompress
+    compressed_dir = "compressed-plain"
+    compressed_suffix = ".lzplain"
+
+    def test_round_trip_aaa_str(self):
+        s = 'a' * 150000
+        self.round_trip(s)
+
+    def test_round_trip_aaa_bytes(self):
+        s = b'a' * 150000
+        self.round_trip(s)
+
+    def test_round_trip_aaa_short(self):
+        s = b'a' * 150000
+
+        # this'll fail because the match for 'aaa...' will run
+        # past the end of the buffer
+        self.assertRaises(compression.CompressionError,
+                          self.round_trip, s, -1)
+
+    def test_round_trip_aaa_long(self):
+        s = b'a' * 150000
+        # this *won't* fail because although the data will run out
+        # before the buffer is full, LZXpress plain does not care
+        # about that.
+        try:
+            self.round_trip(s, 1)
+        except compression.CompressionError as e:
+            self.fail(f"failed to decompress with {e}")
+
+    def test_round_trip_aaab_short(self):
+        s = b'a' * 150000 + b'b'
+
+        # this will *partially* succeed, because the buffer will fill
+        # up at a break in the decompression (not mid-match), and
+        # lzxpress plain does not mind that. However self.round_trip
+        # also makes an assertion that the original data equals the
+        # decompressed result, and it won't because the decompressed
+        # result is one byte shorter.
+        self.assertRaises(AssertionError,
+                          self.round_trip, s, -1)
+
+    def test_round_trip_aaab_unstated(self):
+        s = b'a' * 150000 + b'b'
+
+        # this will succeed, because with no target size given, we
+        # guess a large buffer in the python bindings.
+        try:
+            self.round_trip(s)
+        except compression.CompressionError as e:
+            self.fail(f"failed to decompress with {e}")
+
+    def test_round_trip_30mb(self):
+        s = b'abc' * 10000000
+        # This will decompress into a string bigger than the python
+        # bindings are willing to speculatively allocate, so will fail
+        # to decompress.
+        with self.assertRaises(compression.CompressionError):
+            self.round_trip(s, None)
+
+        # but it will be fine if we use the length
+        try:
+            self.round_trip(s, 0)
+        except compression.CompressionError as e:
+            self.fail(f"failed to decompress with {e}")
+
+    def test_files(self):
+        # We don't go through the whole set, which are already tested
+        # by lib/compression/tests/test_lzxpress_plain.c
+        for fn in ("slow-33d90a24e70515b14cd0",
+                   "midsummer-nights-dream.txt"):
+            self.decompress_file(fn)
+
+    def test_empty_round_trip(self):
+        # not symmetrical with Huffman, this doesn't fail
+        self.round_trip('')
+
+
+class LzxpressHuffmanCompressionTest(BaseCompressionTest):
+    compress = compression.huffman_compress
+    decompress = compression.huffman_decompress
+    compressed_dir = "compressed-huffman"
+    compressed_suffix = ".lzhuff"
+
+    def test_round_trip_aaa_str(self):
+        s = 'a' * 150000
+        self.round_trip(s)
+
+    def test_round_trip_aaa_bytes(self):
+        s = b'a' * 150000
+        self.round_trip(s)
+
+    def test_round_trip_aaa_short(self):
+        s = b'a' * 150000
+
+        # this'll fail because the match for 'aaa...' will run
+        # past the end of the buffer
+        self.assertRaises(compression.CompressionError,
+                          self.round_trip, s, -1)
+
+    def test_round_trip_aaa_long(self):
+        s = b'a' * 150000
+
+        # this'll fail because the data will run out before the buffer
+        # is full.
+        self.assertRaises(compression.CompressionError,
+                          self.round_trip, s, 1)
+
+    def test_round_trip_aaab_short(self):
+        s = b'a' * 150000 + b'b'
+
+        # this *could* be allowed to succeed, because even though we
+        # give it the wrong size, we know the decompression will not
+        # flow over the end of the buffer. The behaviour here appears
+        # to be implementation dependent -- the decompressor has the
+        # option of saying 'whatever' and continuing. We are probably
+        # stricter than Windows.
+        self.assertRaises(compression.CompressionError,
+                          self.round_trip, s, -1)
+
+    def test_round_trip_aaab_unstated(self):
+        s = b'a' * 150000 + b'b'
+
+        # For the Huffman algorithm, the length is really an essential
+        # part of the compression data, and the bindings will reject a
+        # call without it. This happens at the argument parsing stage,
+        # so it is a TypeError (i.e. wrong arguments to the function),
+        # not a CompressionError.
+        self.assertRaises(TypeError,
+                          self.round_trip, s, None)
+
+    def test_files(self):
+        # We don't go through the whole set, which are already tested
+        # by lib/compression/tests/test_lzx_huffman.c
+        for fn in ("slow-33d90a24e70515b14cd0",
+                   "midsummer-nights-dream.txt"):
+            self.decompress_file(fn)
+
+    def test_empty_round_trip(self):
+        with self.assertRaises(compression.CompressionError):
+            self.round_trip('')
index 3212e296dddd2d236afee96a1a589d66965b8169..774b874edbd412e0b66bef12139539ef172cdee1 100755 (executable)
@@ -1902,3 +1902,4 @@ planoldpythontestsuite("proclimitdc",
 
 planoldpythontestsuite("none", "samba.tests.usage")
 planpythontestsuite("fileserver", "samba.tests.dcerpc.mdssvc")
+planoldpythontestsuite("none", "samba.tests.compression")