TODO:TEST! s3:modules: add vfs_io_uring module
authorStefan Metzmacher <metze@samba.org>
Wed, 5 Jun 2019 15:01:49 +0000 (17:01 +0200)
committerStefan Metzmacher <metze@samba.org>
Tue, 15 Oct 2019 07:36:22 +0000 (09:36 +0200)
The module makes use of the new io_uring infrastructure
(introduced in Linux 5.1), see https://lwn.net/Articles/778411/ and
http://git.kernel.dk/cgit/liburing/

TODO: this module compiles, but was not runtime tested yet.

docs-xml/manpages/vfs_io_uring.8.xml [new file with mode: 0644]
docs-xml/wscript_build
source3/modules/vfs_io_uring.c [new file with mode: 0644]
source3/modules/wscript_build
source3/wscript

diff --git a/docs-xml/manpages/vfs_io_uring.8.xml b/docs-xml/manpages/vfs_io_uring.8.xml
new file mode 100644 (file)
index 0000000..4c9da37
--- /dev/null
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry PUBLIC "-//Samba-Team//DTD DocBook V4.2-Based Variant V1.0//EN" "http://www.samba.org/samba/DTD/samba-doc">
+<refentry id="vfs_io_uring.8">
+
+<refmeta>
+       <refentrytitle>vfs_io_uring</refentrytitle>
+       <manvolnum>8</manvolnum>
+       <refmiscinfo class="source">Samba</refmiscinfo>
+       <refmiscinfo class="manual">System Administration tools</refmiscinfo>
+       <refmiscinfo class="version">&doc.version;</refmiscinfo>
+</refmeta>
+
+<refnamediv>
+       <refname>vfs_io_uring</refname>
+       <refpurpose>Implement async io in Samba vfs using io_uring of Linux (>= 5.1).</refpurpose>
+</refnamediv>
+
+<refsynopsisdiv>
+       <cmdsynopsis>
+               <command>vfs objects = io_uring</command>
+       </cmdsynopsis>
+</refsynopsisdiv>
+
+<refsect1>
+       <title>DESCRIPTION</title>
+
+       <para>This VFS module is part of the
+       <citerefentry><refentrytitle>samba</refentrytitle>
+       <manvolnum>7</manvolnum></citerefentry> suite.</para>
+
+       <para>The <command>io_uring</command> VFS module enables asynchronous
+       pread, pwrite and fsync using the io_uring infrastructure of Linux (>= 5.1).
+       This provides much less overhead compared to the usage of the pthreadpool for
+       async io.</para>
+
+       <para>This module SHOULD be listed last in any module stack as
+       it requires real kernel file descriptors.</para>
+
+</refsect1>
+
+
+<refsect1>
+       <title>EXAMPLES</title>
+
+       <para>Straightforward use:</para>
+
+<programlisting>
+        <smbconfsection name="[cooldata]"/>
+       <smbconfoption name="path">/data/ice</smbconfoption>
+       <smbconfoption name="vfs objects">io_uring</smbconfoption>
+</programlisting>
+
+</refsect1>
+
+<refsect1>
+       <title>OPTIONS</title>
+
+       <variablelist>
+
+               <varlistentry>
+               <term>io_uring:num_entries = NUMBER_OF_QUEUE_ENTRIES</term>
+               <listitem>
+               <para>The number of entries in the submission queue.
+               The maximum allowed value is 4096 and the kernel will round the value up to the next power of 2.
+               </para>
+               <para>The default is '128'.</para>
+               </listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term>io_uring:sqpoll = BOOL</term>
+               <listitem>
+               <para>Use the IORING_SETUP_SQPOLL feature.
+               </para>
+               <para>The default is 'no'.</para>
+               </listitem>
+               </varlistentry>
+
+       </variablelist>
+</refsect1>
+
+<refsect1>
+       <title>SEE ALSO</title>
+       <para>
+       <citerefentry><refentrytitle>io_uring_setup</refentrytitle><manvolnum>2</manvolnum></citerefentry>.
+       </para>
+</refsect1>
+
+<refsect1>
+       <title>VERSION</title>
+
+       <para>This man page is part of version &doc.version; of the Samba suite.
+       </para>
+</refsect1>
+
+<refsect1>
+       <title>AUTHOR</title>
+
+       <para>The original Samba software and related utilities
+       were created by Andrew Tridgell. Samba is now developed
+       by the Samba Team as an Open Source project similar
+       to the way the Linux kernel is developed.</para>
+
+</refsect1>
+
+</refentry>
index 70f5b43dd3329d9b124961c5a2c4ed41b5ba1a4c..21158e11fe7bda76f98334201b28f4e211bf3ee8 100644 (file)
@@ -68,6 +68,7 @@ vfs_module_manpages = ['vfs_acl_tdb',
                        'vfs_acl_xattr',
                        'vfs_aio_fork',
                        'vfs_aio_pthread',
+                       'vfs_io_uring',
                        'vfs_audit',
                        'vfs_btrfs',
                        'vfs_cacheprime',
diff --git a/source3/modules/vfs_io_uring.c b/source3/modules/vfs_io_uring.c
new file mode 100644 (file)
index 0000000..61393d6
--- /dev/null
@@ -0,0 +1,468 @@
+/*
+ * Use the io_uring of Linux (>= 5.1)
+ *
+ * Copyright (C) Volker Lendecke 2008
+ * Copyright (C) Jeremy Allison 2010
+ * Copyright (C) Stefan Metzmacher 2019
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "smbd/smbd.h"
+#include "smbd/globals.h"
+#include "lib/util/tevent_unix.h"
+#include "smbprofile.h"
+#include <liburing.h>
+
+struct vfs_io_uring_request;
+
+struct vfs_io_uring_config { /* per-connection state, attached to the VFS handle in vfs_io_uring_connect() */
+       struct io_uring uring; /* the ring, set up by io_uring_queue_init() and torn down in the destructor */
+       struct tevent_fd *fde; /* read event on uring.ring_fd, dispatched to vfs_io_uring_fd_handler() */
+       struct vfs_io_uring_request *queue; /* list of requests not yet copied into a submission slot */
+       struct vfs_io_uring_request *pending; /* list of requests copied into the ring whose completion was not seen yet */
+};
+
+struct vfs_io_uring_request { /* bookkeeping for one async operation; embedded as "ur" in the pread/pwrite/fsync state structs */
+       struct vfs_io_uring_request *prev, *next; /* DLIST linkage on config->queue or config->pending */
+       struct tevent_req *req; /* request finished with tevent_req_done() once the completion is seen */
+       void *state; /* the enclosing *_state object; gets the deny destructor while the request is in the ring */
+       struct io_uring_sqe sqe; /* prepared submission entry, copied into the ring by vfs_io_uring_queue_run() */
+       struct io_uring_cqe cqe; /* copy of the completion entry; cqe.res is evaluated by the *_recv functions */
+       struct timespec start_time; /* timestamp taken when the request was moved to the pending list */
+       struct timespec end_time; /* timestamp taken in the queue run that saw the completion */
+       SMBPROFILE_BYTES_ASYNC_STATE(profile_bytes); /* profiling state toggled between idle and busy */
+};
+
+/*
+ * talloc destructor for the per-connection ring state.
+ *
+ * Drops the tevent fd watcher before the ring itself is torn down and
+ * always returns 0.
+ */
+static int vfs_io_uring_config_destructor(struct vfs_io_uring_config *config)
+{
+       struct io_uring *ring = &config->uring;
+
+       /*
+        * TODO: requests still linked on config->queue or config->pending
+        * are not unlinked here yet (DLIST_REMOVE loops).
+        */
+       TALLOC_FREE(config->fde);
+       io_uring_queue_exit(ring);
+
+       return 0;
+}
+
+/*
+ * Destructor installed on a request state by vfs_io_uring_queue_run()
+ * when the request is moved to the pending list, and removed again once
+ * its completion has been seen.
+ *
+ * It unconditionally returns -1; with talloc that vetoes the free, which
+ * presumably keeps the buffers referenced by the submission entry alive
+ * while the request is outstanding.
+ */
+static int vfs_io_uring_request_state_deny_destructor(void *state)
+{
+       (void)state; /* the state itself is never inspected */
+
+       return -1;
+}
+
+static void vfs_io_uring_fd_handler(struct tevent_context *ev,
+                                   struct tevent_fd *fde,
+                                   uint16_t flags,
+                                   void *private_data);
+
+/*
+ * connect_fn: create the io_uring used by this connection.
+ *
+ * Allocates a vfs_io_uring_config on handle->conn, stores it as the VFS
+ * handle data, lets the rest of the module stack connect and then sets up
+ * the ring. The ring fd is registered for TEVENT_FD_READ on the
+ * connection's event context so that vfs_io_uring_fd_handler() runs when
+ * it becomes readable.
+ *
+ * smb.conf options (per share):
+ *   io_uring:num_entries  ring size, default 128, 0 is treated as 1
+ *   io_uring:sqpoll       request IORING_SETUP_SQPOLL, default false
+ *
+ * Returns 0 on success. A negative result of the next module's connect is
+ * passed through unchanged. If the ring or the fd event cannot be set up,
+ * the next module is disconnected again, errno is set and -1 is returned.
+ */
+static int vfs_io_uring_connect(vfs_handle_struct *handle, const char *service,
+                           const char *user)
+{
+       int ret;
+       struct vfs_io_uring_config *config;
+       unsigned num_entries;
+       bool sqpoll;
+       unsigned flags = 0;
+
+       config = talloc_zero(handle->conn, struct vfs_io_uring_config);
+       if (config == NULL) {
+               DEBUG(0, ("talloc_zero() failed\n"));
+               return -1;
+       }
+
+       SMB_VFS_HANDLE_SET_DATA(handle, config,
+                               NULL, struct vfs_io_uring_config,
+                               return -1);
+
+       ret = SMB_VFS_NEXT_CONNECT(handle, service, user);
+       if (ret < 0) {
+               return ret;
+       }
+
+       /*
+        * The parametric option prefix has to be "io_uring": that is the
+        * name the module is registered under and the prefix of the
+        * documented "io_uring:num_entries" and "io_uring:sqpoll" options.
+        */
+       num_entries = lp_parm_ulong(SNUM(handle->conn),
+                                   "io_uring",
+                                   "num_entries",
+                                   128);
+       if (num_entries == 0) {
+               num_entries = 1;
+       }
+
+       sqpoll = lp_parm_bool(SNUM(handle->conn),
+                            "io_uring",
+                            "sqpoll",
+                            false);
+       if (sqpoll) {
+               flags |= IORING_SETUP_SQPOLL;
+       }
+
+       ret = io_uring_queue_init(num_entries, &config->uring, flags);
+       if (ret < 0) {
+               /* ret carries the negated errno value */
+               SMB_VFS_NEXT_DISCONNECT(handle);
+               errno = -ret;
+               return -1;
+       }
+
+       /*
+        * Only from here on there is a ring to tear down, so the
+        * destructor is installed after io_uring_queue_init() succeeded.
+        */
+       talloc_set_destructor(config, vfs_io_uring_config_destructor);
+
+       config->fde = tevent_add_fd(handle->conn->sconn->ev_ctx,
+                                   config,
+                                   config->uring.ring_fd,
+                                   TEVENT_FD_READ,
+                                   vfs_io_uring_fd_handler,
+                                   handle);
+       if (config->fde == NULL) {
+               /* keep errno intact across the disconnect call */
+               ret = errno;
+               SMB_VFS_NEXT_DISCONNECT(handle);
+               errno = ret;
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Push queued requests into the ring and reap finished ones.
+ *
+ * Phase 1: for every request on config->queue, as long as the ring hands
+ * out a free submission entry, copy the prepared sqe into it, move the
+ * request to config->pending and install the deny destructor on its state.
+ * Requests that do not fit stay on config->queue for a later run.
+ *
+ * Phase 2: io_uring_submit() hands the filled entries to the kernel.
+ *
+ * Phase 3: every available completion entry is copied into its request
+ * (found via cqe->user_data), the request is unlinked from
+ * config->pending, the deny destructor is removed and the tevent request
+ * is marked done. Finally the completion queue is advanced by the number
+ * of entries consumed.
+ */
+static void vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
+{
+       struct vfs_io_uring_request *cur = NULL, *next = NULL;
+       struct io_uring_cqe *cqe = NULL;
+       unsigned cqhead;
+       unsigned nr = 0;
+       struct timespec start_time;
+       struct timespec end_time;
+       int ret;
+
+       PROFILE_TIMESTAMP(&start_time);
+
+       for (cur = config->queue; cur != NULL; cur = next) {
+               struct io_uring_sqe *sqe = NULL;
+
+               /* cur is unlinked below, so remember its successor first */
+               next = cur->next;
+
+               sqe = io_uring_get_sqe(&config->uring);
+               if (sqe == NULL) {
+                       /* submission queue is full, retry on a later run */
+                       break;
+               }
+
+               talloc_set_destructor(cur->state,
+                       vfs_io_uring_request_state_deny_destructor);
+               DLIST_REMOVE(config->queue, cur);
+               *sqe = cur->sqe;
+               DLIST_ADD_END(config->pending, cur);
+               SMBPROFILE_BYTES_ASYNC_SET_BUSY(cur->profile_bytes);
+
+               cur->start_time = start_time;
+       }
+
+       ret = io_uring_submit(&config->uring);
+       if (ret < 0) {
+               /*
+                * Do not swallow the failure silently. The requests moved
+                * to config->pending above stay there.
+                *
+                * TODO: retry or fail the affected requests.
+                */
+               DEBUG(1, ("io_uring_submit() failed: ret=%d\n", ret));
+       }
+
+       PROFILE_TIMESTAMP(&end_time);
+
+       io_uring_for_each_cqe(&config->uring, cqhead, cqe) {
+               struct tevent_req *req = NULL;
+
+               /* user_data was set to the address of the request */
+               cur = (struct vfs_io_uring_request *)(uintptr_t)cqe->user_data;
+               req = talloc_get_type_abort(cur->req, struct tevent_req);
+
+               talloc_set_destructor(cur->state, NULL);
+               DLIST_REMOVE(config->pending, cur);
+               cur->cqe = *cqe;
+
+               SMBPROFILE_BYTES_ASYNC_SET_IDLE(cur->profile_bytes);
+               cur->end_time = end_time;
+
+               /*
+                * We rely on tevent_req_defer_callback() being called
+                * already.
+                */
+               tevent_req_done(req);
+               nr++;
+       }
+
+       /* release the nr completion entries consumed above */
+       io_uring_cq_advance(&config->uring, nr);
+}
+
+/*
+ * tevent callback for the ring fd registered in vfs_io_uring_connect().
+ *
+ * private_data is the vfs_handle_struct that was passed to tevent_add_fd();
+ * ev, fde and flags are not used. Panics if the handle carries no module
+ * data, otherwise runs the submission/completion processing.
+ */
+static void vfs_io_uring_fd_handler(struct tevent_context *ev,
+                                   struct tevent_fd *fde,
+                                   uint16_t flags,
+                                   void *private_data)
+{
+       vfs_handle_struct *handle = private_data;
+       struct vfs_io_uring_config *ring_cfg = NULL;
+
+       SMB_VFS_HANDLE_GET_DATA(handle, ring_cfg,
+                               struct vfs_io_uring_config,
+                               smb_panic(__location__));
+
+       vfs_io_uring_queue_run(ring_cfg);
+}
+
+struct vfs_io_uring_pread_state { /* tevent request state of one async pread */
+       struct vfs_io_uring_request ur; /* generic request bookkeeping; its address is stored as sqe.user_data */
+       struct iovec iov; /* the single iovec the READV submission entry points at (destination buffer and length) */
+};
+
+/*
+ * pread_send_fn: start an asynchronous read of n bytes at offset from fsp
+ * into data.
+ *
+ * The read is described as an IORING_OP_READV submission entry with a
+ * single iovec that lives in the request state. The request is appended
+ * to config->queue and vfs_io_uring_queue_run() is called right away.
+ *
+ * Returns the new tevent_req, or NULL if it could not be created. If the
+ * request is no longer in progress after the queue run it is handed back
+ * through tevent_req_post(); otherwise its callback is deferred to ev.
+ */
+static struct tevent_req *vfs_io_uring_pread_send(struct vfs_handle_struct *handle,
+                                            TALLOC_CTX *mem_ctx,
+                                            struct tevent_context *ev,
+                                            struct files_struct *fsp,
+                                            void *data,
+                                            size_t n, off_t offset)
+{
+       struct tevent_req *req = NULL;
+       struct vfs_io_uring_pread_state *state = NULL;
+       struct vfs_io_uring_config *config = NULL;
+
+       SMB_VFS_HANDLE_GET_DATA(handle, config,
+                               struct vfs_io_uring_config,
+                               smb_panic(__location__));
+
+       req = tevent_req_create(mem_ctx, &state,
+                               struct vfs_io_uring_pread_state);
+       if (req == NULL) {
+               return NULL;
+       }
+       state->ur.req = req;
+       state->ur.state = state;
+
+       /* account this as an async pread of n bytes, not as an fsync */
+       SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pread, profile_p,
+                                    state->ur.profile_bytes, n);
+       SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);
+
+       state->iov.iov_base = (void *)data;
+       state->iov.iov_len = n;
+       state->ur.sqe = (struct io_uring_sqe) {
+               /* lets vfs_io_uring_queue_run() find the request again */
+               .user_data = (uintptr_t)&state->ur,
+               .opcode = IORING_OP_READV,
+               .flags = 0,
+               .ioprio = 0,
+               .fd = fsp->fh->fd,
+               .off = offset,
+               /* addr/len describe an iovec array with one element */
+               .addr = (uintptr_t)&state->iov,
+               .len = 1,
+               .rw_flags = 0,
+       };
+       DLIST_ADD_END(config->queue, &state->ur);
+
+       vfs_io_uring_queue_run(config);
+
+       if (!tevent_req_is_in_progress(req)) {
+               return tevent_req_post(req, ev);
+       }
+
+       tevent_req_defer_callback(req, ev);
+       return req;
+}
+
+/*
+ * pread_recv_fn: collect the result of an async pread.
+ *
+ * Always fills vfs_aio_state->duration from the request's start and end
+ * timestamps. Returns -1 if the tevent request itself failed (the error
+ * is stored by tevent_req_is_unix_error()). Otherwise a negative
+ * completion result is returned as -1 with the negated value in
+ * vfs_aio_state->error, and a non-negative result is returned as-is with
+ * error set to 0.
+ */
+static ssize_t vfs_io_uring_pread_recv(struct tevent_req *req,
+                                 struct vfs_aio_state *vfs_aio_state)
+{
+       struct vfs_io_uring_pread_state *state = tevent_req_data(
+               req, struct vfs_io_uring_pread_state);
+       int res;
+
+       vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
+                                                &state->ur.start_time);
+
+       if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
+               return -1;
+       }
+
+       /* take the result out of the state before the request is released */
+       res = state->ur.cqe.res;
+       vfs_aio_state->error = (res < 0) ? -res : 0;
+
+       tevent_req_received(req);
+       return (res < 0) ? -1 : res;
+}
+
+struct vfs_io_uring_pwrite_state { /* tevent request state of one async pwrite */
+       struct vfs_io_uring_request ur; /* generic request bookkeeping; its address is stored as sqe.user_data */
+       struct iovec iov; /* the single iovec the WRITEV submission entry points at (source buffer and length) */
+};
+
+/*
+ * pwrite_send_fn: start an asynchronous write of n bytes from data to fsp
+ * at offset.
+ *
+ * The write is described as an IORING_OP_WRITEV submission entry with a
+ * single iovec that lives in the request state. The request is appended
+ * to config->queue and vfs_io_uring_queue_run() is called right away.
+ *
+ * Returns the new tevent_req, or NULL if it could not be created. If the
+ * request is no longer in progress after the queue run it is handed back
+ * through tevent_req_post(); otherwise its callback is deferred to ev.
+ */
+static struct tevent_req *vfs_io_uring_pwrite_send(struct vfs_handle_struct *handle,
+                                             TALLOC_CTX *mem_ctx,
+                                             struct tevent_context *ev,
+                                             struct files_struct *fsp,
+                                             const void *data,
+                                             size_t n, off_t offset)
+{
+       struct tevent_req *req = NULL;
+       struct vfs_io_uring_pwrite_state *state = NULL;
+       struct vfs_io_uring_config *config = NULL;
+
+       SMB_VFS_HANDLE_GET_DATA(handle, config,
+                               struct vfs_io_uring_config,
+                               smb_panic(__location__));
+
+       req = tevent_req_create(mem_ctx, &state,
+                               struct vfs_io_uring_pwrite_state);
+       if (req == NULL) {
+               return NULL;
+       }
+       state->ur.req = req;
+       state->ur.state = state;
+
+       /* account this as an async pwrite of n bytes, not as an fsync */
+       SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pwrite, profile_p,
+                                    state->ur.profile_bytes, n);
+       SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);
+
+       /* struct iovec has no const base pointer; the data is only read */
+       state->iov.iov_base = discard_const(data);
+       state->iov.iov_len = n;
+       state->ur.sqe = (struct io_uring_sqe) {
+               /* lets vfs_io_uring_queue_run() find the request again */
+               .user_data = (uintptr_t)&state->ur,
+               .opcode = IORING_OP_WRITEV,
+               .flags = 0,
+               .ioprio = 0,
+               .fd = fsp->fh->fd,
+               .off = offset,
+               /* addr/len describe an iovec array with one element */
+               .addr = (uintptr_t)&state->iov,
+               .len = 1,
+               .rw_flags = 0,
+       };
+       DLIST_ADD_END(config->queue, &state->ur);
+
+       vfs_io_uring_queue_run(config);
+
+       if (!tevent_req_is_in_progress(req)) {
+               return tevent_req_post(req, ev);
+       }
+
+       tevent_req_defer_callback(req, ev);
+       return req;
+}
+
+/*
+ * pwrite_recv_fn: collect the result of an async pwrite.
+ *
+ * Always fills vfs_aio_state->duration from the request's start and end
+ * timestamps. Returns -1 if the tevent request itself failed (the error
+ * is stored by tevent_req_is_unix_error()). Otherwise a negative
+ * completion result is returned as -1 with the negated value in
+ * vfs_aio_state->error, and a non-negative result is returned as-is with
+ * error set to 0.
+ */
+static ssize_t vfs_io_uring_pwrite_recv(struct tevent_req *req,
+                                  struct vfs_aio_state *vfs_aio_state)
+{
+       struct vfs_io_uring_pwrite_state *state = tevent_req_data(
+               req, struct vfs_io_uring_pwrite_state);
+       int res;
+
+       vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
+                                                &state->ur.start_time);
+
+       if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
+               return -1;
+       }
+
+       /* take the result out of the state before the request is released */
+       res = state->ur.cqe.res;
+       vfs_aio_state->error = (res < 0) ? -res : 0;
+
+       tevent_req_received(req);
+       return (res < 0) ? -1 : res;
+}
+
+struct vfs_io_uring_fsync_state { /* tevent request state of one async fsync */
+       struct vfs_io_uring_request ur; /* generic request bookkeeping; its address is stored as sqe.user_data */
+};
+
+/*
+ * fsync_send_fn: start an asynchronous fsync on fsp.
+ *
+ * Prepares an IORING_OP_FSYNC submission entry for the file's fd, appends
+ * the request to the queue and runs the queue right away.
+ *
+ * Returns the new tevent_req, or NULL if it could not be created. If the
+ * request is still in progress after the queue run its callback is
+ * deferred to ev; otherwise it is handed back through tevent_req_post().
+ */
+static struct tevent_req *vfs_io_uring_fsync_send(struct vfs_handle_struct *handle,
+                                            TALLOC_CTX *mem_ctx,
+                                            struct tevent_context *ev,
+                                            struct files_struct *fsp)
+{
+       struct vfs_io_uring_config *ring_cfg = NULL;
+       struct vfs_io_uring_fsync_state *state = NULL;
+       struct tevent_req *req = NULL;
+
+       SMB_VFS_HANDLE_GET_DATA(handle, ring_cfg,
+                               struct vfs_io_uring_config,
+                               smb_panic(__location__));
+
+       req = tevent_req_create(mem_ctx, &state,
+                               struct vfs_io_uring_fsync_state);
+       if (req == NULL) {
+               return NULL;
+       }
+
+       state->ur.state = state;
+       state->ur.req = req;
+
+       SMBPROFILE_BYTES_ASYNC_START(syscall_asys_fsync, profile_p,
+                                    state->ur.profile_bytes, 0);
+       SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);
+
+       /*
+        * Members not named here (flags, ioprio, fsync_flags, ...) are
+        * zero-initialized by the compound literal.
+        */
+       state->ur.sqe = (struct io_uring_sqe) {
+               .opcode = IORING_OP_FSYNC,
+               .fd = fsp->fh->fd,
+               .user_data = (uintptr_t)&state->ur,
+       };
+       DLIST_ADD_END(ring_cfg->queue, &state->ur);
+
+       vfs_io_uring_queue_run(ring_cfg);
+
+       if (tevent_req_is_in_progress(req)) {
+               tevent_req_defer_callback(req, ev);
+               return req;
+       }
+
+       return tevent_req_post(req, ev);
+}
+
+/*
+ * fsync_recv_fn: collect the result of an async fsync.
+ *
+ * Always fills vfs_aio_state->duration from the request's start and end
+ * timestamps. Returns -1 if the tevent request itself failed (the error
+ * is stored by tevent_req_is_unix_error()). Otherwise a negative
+ * completion result is returned as -1 with the negated value in
+ * vfs_aio_state->error, and a non-negative result is returned as-is with
+ * error set to 0.
+ */
+static int vfs_io_uring_fsync_recv(struct tevent_req *req,
+                             struct vfs_aio_state *vfs_aio_state)
+{
+       struct vfs_io_uring_fsync_state *state = tevent_req_data(
+               req, struct vfs_io_uring_fsync_state);
+       int res;
+
+       vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
+                                                &state->ur.start_time);
+
+       if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
+               return -1;
+       }
+
+       /* take the result out of the state before the request is released */
+       res = state->ur.cqe.res;
+       vfs_aio_state->error = (res < 0) ? -res : 0;
+
+       tevent_req_received(req);
+       return (res < 0) ? -1 : res;
+}
+
+static struct vfs_fn_pointers vfs_io_uring_fns = { /* operations provided by this module; registered in vfs_io_uring_init() */
+       .connect_fn = vfs_io_uring_connect, /* sets up the per-connection ring */
+       .pread_send_fn = vfs_io_uring_pread_send,
+       .pread_recv_fn = vfs_io_uring_pread_recv,
+       .pwrite_send_fn = vfs_io_uring_pwrite_send,
+       .pwrite_recv_fn = vfs_io_uring_pwrite_recv,
+       .fsync_send_fn = vfs_io_uring_fsync_send,
+       .fsync_recv_fn = vfs_io_uring_fsync_recv,
+};
+
+static_decl_vfs;
+/*
+ * Module entry point: registers the operations table under the module
+ * name "io_uring" and returns the status of that registration.
+ */
+NTSTATUS vfs_io_uring_init(TALLOC_CTX *ctx)
+{
+       NTSTATUS status;
+
+       status = smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
+                                 "io_uring",
+                                 &vfs_io_uring_fns);
+       return status;
+}
index e4f92fac150867f3410a43d7692409340d92421b..fd66a5387b2cba47c9f0a5968666a38e21eaedf7 100644 (file)
@@ -396,6 +396,14 @@ bld.SAMBA3_MODULE('vfs_aio_pthread',
                  internal_module=bld.SAMBA3_IS_STATIC_MODULE('vfs_aio_pthread'),
                  enabled=bld.SAMBA3_IS_ENABLED_MODULE('vfs_aio_pthread'))
 
+bld.SAMBA3_MODULE('vfs_io_uring',
+                 subsystem='vfs',
+                 source='vfs_io_uring.c',
+                 deps='samba-util tevent uring',
+                 init_function='',
+                 internal_module=bld.SAMBA3_IS_STATIC_MODULE('vfs_io_uring'),
+                 enabled=bld.SAMBA3_IS_ENABLED_MODULE('vfs_io_uring'))
+
 bld.SAMBA3_MODULE('vfs_preopen',
                  subsystem='vfs',
                  source='vfs_preopen.c',
index 10160fd373cc92dff87a75458b65320ff3395ce5..0efe4ca24765670ef53b06bcbd0f69531897edf5 100644 (file)
@@ -1756,6 +1756,12 @@ main() {
                                       and conf.CHECK_LIB('dbus-1', shlib=True)):
             conf.DEFINE('HAVE_DBUS', '1')
 
+    if conf.CHECK_CFG(package='liburing', args='--cflags --libs',
+                      msg='Checking for liburing package', uselib_store="URING"):
+        if (conf.CHECK_HEADERS('liburing.h', lib='uring')
+                                      and conf.CHECK_LIB('uring', shlib=True)):
+            conf.DEFINE('HAVE_LIBURING', '1')
+
     conf.env.build_regedit = False
     if not Options.options.with_regedit == False:
         conf.PROCESS_SEPARATE_RULE('system_ncurses')
@@ -1934,6 +1940,9 @@ main() {
     if (conf.CONFIG_SET('HAVE_STRUCT_MSGHDR_MSG_CONTROL') or conf.CONFIG_SET('HAVE_STRUCT_MSGHDR_MSG_ACCRIGHTS')):
         default_shared_modules.extend(TO_LIST('vfs_aio_fork'))
 
+    if conf.CONFIG_SET('HAVE_LIBURING'):
+        default_shared_modules.extend(TO_LIST('vfs_io_uring'))
+
     if Options.options.with_pthreadpool:
         default_shared_modules.extend(TO_LIST('vfs_aio_pthread'))