Add vfs_aio_fork
authorVolker Lendecke <vl@samba.org>
Thu, 21 Feb 2008 15:14:08 +0000 (16:14 +0100)
committerVolker Lendecke <vl@samba.org>
Fri, 29 Feb 2008 08:47:53 +0000 (09:47 +0100)
This is used for two purposes:

First, I'm using it to test the async I/O code. In the forked process it is
pretty easy to delay a reply for a random amount of time. See the
BUILD_FARM_HACKS snippet.

Second, there are systems around that claim to have Posix AIO but which is
broken. This might be some help for those systems.

Also add tests how to pass file descriptors

WHATSNEW.txt
source/Makefile.in
source/configure.in
source/modules/vfs_aio_fork.c [new file with mode: 0644]

index 8c32d9aaaf238de9e1151df70572b43691196c4b..76a66b3d297910ff484999d6b403514da9be3afd 100644 (file)
@@ -205,6 +205,8 @@ o  Volker Lendecke <vl@samba.org>
     * Support for storing alternate data streams in xattrs.
     * Implement a generic in-memory cache based on rb-trees.
     * Add implicit temporary talloc contexts via talloc_stack().
+    * Speed up the smbclient "get" command
+    * Add the aio_fork module
 
 
 o   Stefan Metzmacher <metze@samba.org>
index 7ab0c4fb37d3658e30f716568c3cb8478c6ebb8a..2f963e295da60f9ca2fb07136aafdcd353b2ae99 100644 (file)
@@ -601,6 +601,7 @@ VFS_NOTIFY_FAM_OBJ = modules/vfs_notify_fam.o
 VFS_READAHEAD_OBJ = modules/vfs_readahead.o
 VFS_TSMSM_OBJ = modules/vfs_tsmsm.o
 VFS_FILEID_OBJ = modules/vfs_fileid.o
+VFS_AIO_FORK_OBJ = modules/vfs_aio_fork.o
 VFS_SYNCOPS_OBJ = modules/vfs_syncops.o
 
 PLAINTEXT_AUTH_OBJ = auth/pampass.o auth/pass_check.o
@@ -1854,6 +1855,10 @@ bin/fileid.@SHLIBEXT@: $(BINARY_PREREQS) $(VFS_FILEID_OBJ)
        @echo "Building plugin $@"
        @$(SHLD_MODULE) $(VFS_FILEID_OBJ)
 
+bin/aio_fork.@SHLIBEXT@: $(BINARY_PREREQS) $(VFS_AIO_FORK_OBJ)
+       @echo "Building plugin $@"
+       @$(SHLD_MODULE) $(VFS_AIO_FORK_OBJ)
+
 #########################################################
 ## IdMap NSS plugins
 
index 5de8a289db4a9aba1cde12ed63956e1a8dfba31c..495da90c33b86c06f9610ad1770f7f64f5f0da44 100644 (file)
@@ -809,6 +809,56 @@ if test x"$samba_cv_unixsocket" = x"yes"; then
    AC_DEFINE(HAVE_UNIXSOCKET,1,[If we need to build with unixsocket support])
 fi
 
+#############################################
+# check for fd passing struct via msg_control
+AC_CACHE_CHECK([for fd passing via msg_control],samba_cv_msghdr_msg_control, [
+    AC_TRY_COMPILE([
+#include <sys/types.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <sys/socket.h>
+#include <sys/un.h>],
+[
+       struct msghdr msg;
+       union {
+             struct cmsghdr cm;
+             char control[CMSG_SPACE(sizeof(int))];
+       } control_un;
+       msg.msg_control = control_un.control;
+       msg.msg_controllen = sizeof(control_un.control);
+],
+       samba_cv_msghdr_msg_control=yes,samba_cv_msghdr_msg_control=no)])
+if test x"$samba_cv_msghdr_msg_control" = x"yes"; then
+   AC_DEFINE(HAVE_MSGHDR_MSG_CONTROL,1,
+            [If we can use msg_control for passing file descriptors])
+fi
+
+#############################################
+# check for fd passing struct via msg_acctrights
+AC_CACHE_CHECK([for fd passing via msg_acctrights],
+               samba_cv_msghdr_msg_acctrights, [
+    AC_TRY_COMPILE([
+#include <sys/types.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <sys/socket.h>
+#include <sys/un.h>],
+[
+       struct msghdr msg;
+       int fd;
+       msg.msg_acctrights = (caddr_t) &fd;
+       msg.msg_acctrightslen = sizeof(fd);
+],
+       samba_cv_msghdr_msg_acctrights=yes,samba_cv_msghdr_msg_acctrights=no)])
+if test x"$samba_cv_msghdr_msg_acctrights" = x"yes"; then
+   AC_DEFINE(HAVE_MSGHDR_MSG_ACCTRIGHTS,1,
+            [If we can use msg_acctrights for passing file descriptors])
+fi
+
+if test x"$samba_cv_msghdr_msg_control" = x"yes" -o \
+       x"$samba_cv_msghdr_msg_acctright" = x"yes"; then
+       default_shared_modules="$default_shared_modules vfs_aio_fork"
+fi
 
 AC_CACHE_CHECK([for sig_atomic_t type],samba_cv_sig_atomic_t, [
     AC_TRY_COMPILE([
@@ -6038,6 +6088,7 @@ SMB_MODULE(vfs_gpfs, \$(VFS_GPFS_OBJ), "bin/gpfs.$SHLIBEXT", VFS)
 SMB_MODULE(vfs_readahead, \$(VFS_READAHEAD_OBJ), "bin/readahead.$SHLIBEXT", VFS)
 SMB_MODULE(vfs_tsmsm, \$(VFS_TSMSM_OBJ), "bin/tsmsm.$SHLIBEXT", VFS)
 SMB_MODULE(vfs_fileid, \$(VFS_FILEID_OBJ), "bin/fileid.$SHLIBEXT", VFS)
+SMB_MODULE(vfs_aio_fork, \$(VFS_AIO_FORK_OBJ), "bin/aio_fork.$SHLIBEXT", VFS)
 SMB_MODULE(vfs_syncops, \$(VFS_SYNCOPS_OBJ), "bin/syncops.$SHLIBEXT", VFS)
 SMB_MODULE(vfs_zfsacl, \$(VFS_ZFSACL_OBJ), "bin/zfsacl.$SHLIBEXT", VFS)
 SMB_MODULE(vfs_notify_fam, \$(VFS_NOTIFY_FAM_OBJ), "bin/notify_fam.$SHLIBEXT", VFS)
diff --git a/source/modules/vfs_aio_fork.c b/source/modules/vfs_aio_fork.c
new file mode 100644 (file)
index 0000000..21f63d0
--- /dev/null
@@ -0,0 +1,728 @@
+/*
+ * Simulate the Posix AIO using mmap/fork
+ *
+ * Copyright (C) Volker Lendecke 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "includes.h"
+
+struct mmap_area {
+       size_t size;
+       volatile void *ptr;
+};
+
+static int mmap_area_destructor(struct mmap_area *area)
+{
+       munmap((void *)area->ptr, area->size);
+       return 0;
+}
+
+static struct mmap_area *mmap_area_init(TALLOC_CTX *mem_ctx, size_t size)
+{
+       struct mmap_area *result;
+       int fd;
+
+       result = talloc(mem_ctx, struct mmap_area);
+       if (result == NULL) {
+               DEBUG(0, ("talloc failed\n"));
+               goto fail;
+       }
+
+       fd = open("/dev/zero", O_RDWR);
+       if (fd == -1) {
+               DEBUG(3, ("open(\"/dev/zero\") failed: %s\n",
+                         strerror(errno)));
+               goto fail;
+       }
+
+       result->ptr = mmap(NULL, size, PROT_READ|PROT_WRITE,
+                          MAP_SHARED|MAP_FILE, fd, 0);
+       if (result->ptr == MAP_FAILED) {
+               DEBUG(1, ("mmap failed: %s\n", strerror(errno)));
+               goto fail;
+       }
+
+       result->size = size;
+       talloc_set_destructor(result, mmap_area_destructor);
+
+       return result;
+
+fail:
+       TALLOC_FREE(result);
+       return NULL;
+}
+
+struct rw_cmd {
+       size_t n;
+       SMB_OFF_T offset;
+       bool read_cmd;
+};
+
+struct rw_ret {
+       ssize_t size;
+       int ret_errno;
+};
+
+struct aio_child_list;
+
+struct aio_child {
+       struct aio_child *prev, *next;
+       struct aio_child_list *list;
+       SMB_STRUCT_AIOCB *aiocb;
+       pid_t pid;
+       int sockfd;
+       struct fd_event *sock_event;
+       struct rw_ret retval;
+       struct mmap_area *map;  /* ==NULL means write request */
+       bool dont_delete;       /* Marked as in use since last cleanup */
+       bool cancelled;
+       bool read_cmd;
+};
+
+struct aio_child_list {
+       struct aio_child *children;
+       struct timed_event *cleanup_event;
+};
+
+static void free_aio_children(void **p)
+{
+       TALLOC_FREE(*p);
+}
+
+static ssize_t read_fd(int fd, void *ptr, size_t nbytes, int *recvfd)
+{
+       struct msghdr msg;
+       struct iovec iov[1];
+       ssize_t n;
+#ifndef HAVE_MSGHDR_MSG_CONTROL
+       int newfd;
+#endif
+
+#ifdef HAVE_MSGHDR_MSG_CONTROL
+       union {
+         struct cmsghdr        cm;
+         char                          control[CMSG_SPACE(sizeof(int))];
+       } control_un;
+       struct cmsghdr  *cmptr;
+
+       msg.msg_control = control_un.control;
+       msg.msg_controllen = sizeof(control_un.control);
+#else
+#if HAVE_MSGHDR_MSG_ACCTRIGHTS
+       msg.msg_accrights = (caddr_t) &newfd;
+       msg.msg_accrightslen = sizeof(int);
+#else
+#error Can not pass file descriptors
+#endif
+#endif
+
+       msg.msg_name = NULL;
+       msg.msg_namelen = 0;
+
+       iov[0].iov_base = ptr;
+       iov[0].iov_len = nbytes;
+       msg.msg_iov = iov;
+       msg.msg_iovlen = 1;
+
+       if ( (n = recvmsg(fd, &msg, 0)) <= 0) {
+               return(n);
+       }
+
+#ifdef HAVE_MSGHDR_MSG_CONTROL
+       if ((cmptr = CMSG_FIRSTHDR(&msg)) != NULL
+           && cmptr->cmsg_len == CMSG_LEN(sizeof(int))) {
+               if (cmptr->cmsg_level != SOL_SOCKET) {
+                       DEBUG(10, ("control level != SOL_SOCKET"));
+                       errno = EINVAL;
+                       return -1;
+               }
+               if (cmptr->cmsg_type != SCM_RIGHTS) {
+                       DEBUG(10, ("control type != SCM_RIGHTS"));
+                       errno = EINVAL;
+                       return -1;
+               }
+               *recvfd = *((int *) CMSG_DATA(cmptr));
+       } else {
+               *recvfd = -1;           /* descriptor was not passed */
+       }
+#else
+       if (msg.msg_accrightslen == sizeof(int)) {
+               *recvfd = newfd;
+       }
+       else {
+               *recvfd = -1;           /* descriptor was not passed */
+       }
+#endif
+
+       return(n);
+}
+
+static ssize_t write_fd(int fd, void *ptr, size_t nbytes, int sendfd)
+{
+       struct msghdr   msg;
+       struct iovec    iov[1];
+
+#ifdef HAVE_MSGHDR_MSG_CONTROL
+       union {
+               struct cmsghdr  cm;
+               char control[CMSG_SPACE(sizeof(int))];
+       } control_un;
+       struct cmsghdr  *cmptr;
+
+       ZERO_STRUCT(msg);
+       ZERO_STRUCT(control_un);
+
+       msg.msg_control = control_un.control;
+       msg.msg_controllen = sizeof(control_un.control);
+
+       cmptr = CMSG_FIRSTHDR(&msg);
+       cmptr->cmsg_len = CMSG_LEN(sizeof(int));
+       cmptr->cmsg_level = SOL_SOCKET;
+       cmptr->cmsg_type = SCM_RIGHTS;
+       *((int *) CMSG_DATA(cmptr)) = sendfd;
+#else
+       ZERO_STRUCT(msg);
+       msg.msg_accrights = (caddr_t) &sendfd;
+       msg.msg_accrightslen = sizeof(int);
+#endif
+
+       msg.msg_name = NULL;
+       msg.msg_namelen = 0;
+
+       ZERO_STRUCT(iov);
+       iov[0].iov_base = ptr;
+       iov[0].iov_len = nbytes;
+       msg.msg_iov = iov;
+       msg.msg_iovlen = 1;
+
+       return (sendmsg(fd, &msg, 0));
+}
+
+static void aio_child_cleanup(struct event_context *event_ctx,
+                             struct timed_event *te,
+                             const struct timeval *now,
+                             void *private_data)
+{
+       struct aio_child_list *list = talloc_get_type_abort(
+               private_data, struct aio_child_list);
+       struct aio_child *child, *next;
+
+       TALLOC_FREE(list->cleanup_event);
+
+       for (child = list->children; child != NULL; child = next) {
+               next = child->next;
+
+               if (child->aiocb != NULL) {
+                       DEBUG(10, ("child %d currently active\n",
+                                  (int)child->pid));
+                       continue;
+               }
+
+               if (child->dont_delete) {
+                       DEBUG(10, ("Child %d was active since last cleanup\n",
+                                  (int)child->pid));
+                       child->dont_delete = false;
+                       continue;
+               }
+
+               DEBUG(10, ("Child %d idle for more than 30 seconds, "
+                          "deleting\n", (int)child->pid));
+
+               TALLOC_FREE(child);
+       }
+
+       if (list->children != NULL) {
+               /*
+                * Re-schedule the next cleanup round
+                */
+               list->cleanup_event = event_add_timed(smbd_event_context(), list,
+                                                     timeval_add(now, 30, 0),
+                                                     "aio_child_cleanup",
+                                                     aio_child_cleanup, list);
+
+       }
+}
+
+static struct aio_child_list *init_aio_children(struct vfs_handle_struct *handle)
+{
+       struct aio_child_list *data = NULL;
+
+       if (SMB_VFS_HANDLE_TEST_DATA(handle)) {
+               SMB_VFS_HANDLE_GET_DATA(handle, data, struct aio_child_list,
+                                       return NULL);
+       }
+
+       if (data == NULL) {
+               data = TALLOC_ZERO_P(NULL, struct aio_child_list);
+               if (data == NULL) {
+                       return NULL;
+               }
+       }
+
+       /*
+        * Regardless of whether the child_list had been around or not, make
+        * sure that we have a cleanup timed event. This timed event will
+        * delete itself when it finds that no children are around anymore.
+        */
+
+       if (data->cleanup_event == NULL) {
+               data->cleanup_event = event_add_timed(smbd_event_context(), data,
+                                                     timeval_current_ofs(30, 0),
+                                                     "aio_child_cleanup",
+                                                     aio_child_cleanup, data);
+               if (data->cleanup_event == NULL) {
+                       TALLOC_FREE(data);
+                       return NULL;
+               }
+       }
+
+       if (!SMB_VFS_HANDLE_TEST_DATA(handle)) {
+               SMB_VFS_HANDLE_SET_DATA(handle, data, free_aio_children,
+                                       struct aio_child_list, return False);
+       }
+
+       return data;
+}
+
+static void aio_child_loop(int sockfd, struct mmap_area *map)
+{
+       while (true) {
+               int fd = -1;
+               ssize_t ret;
+               struct rw_cmd cmd_struct;
+               struct rw_ret ret_struct;
+
+               ret = read_fd(sockfd, &cmd_struct, sizeof(cmd_struct), &fd);
+               if (ret != sizeof(cmd_struct)) {
+                       DEBUG(10, ("read_fd returned %d: %s\n", (int)ret,
+                                  strerror(errno)));
+                       exit(1);
+               }
+
+               DEBUG(10, ("aio_child_loop: %s %d bytes at %d from fd %d\n",
+                          cmd_struct.read_cmd ? "read" : "write",
+                          (int)cmd_struct.n, (int)cmd_struct.offset, fd));
+
+#ifdef ENABLE_BUILD_FARM_HACKS
+               {
+                       /*
+                        * In the build farm, we want erratic behaviour for
+                        * async I/O times
+                        */
+                       uint8_t randval;
+                       unsigned msecs;
+                       /*
+                        * use generate_random_buffer, we just forked from a
+                        * common parent state
+                        */
+                       generate_random_buffer(&randval, sizeof(randval));
+                       msecs = randval + 20;
+                       DEBUG(10, ("delaying for %u msecs\n", msecs));
+                       smb_msleep(msecs);
+               }
+#endif
+
+
+               ZERO_STRUCT(ret_struct);
+
+               if (cmd_struct.read_cmd) {
+                       ret_struct.size = sys_pread(
+                               fd, (void *)map->ptr, cmd_struct.n,
+                               cmd_struct.offset);
+               }
+               else {
+                       ret_struct.size = sys_pwrite(
+                               fd, (void *)map->ptr, cmd_struct.n,
+                               cmd_struct.offset);
+               }
+
+               DEBUG(10, ("aio_child_loop: syscall returned %d\n",
+                          (int)ret_struct.size));
+
+               if (ret_struct.size == -1) {
+                       ret_struct.ret_errno = errno;
+               }
+
+               ret = write_data(sockfd, (char *)&ret_struct,
+                                sizeof(ret_struct));
+               if (ret != sizeof(ret_struct)) {
+                       DEBUG(10, ("could not write ret_struct: %s\n",
+                                  strerror(errno)));
+                       exit(2);
+               }
+
+               close(fd);
+       }
+}
+
+static void handle_aio_completion(struct event_context *event_ctx,
+                                 struct fd_event *event, uint16 flags,
+                                 void *p)
+{
+       struct aio_child *child = (struct aio_child *)p;
+       uint16 mid;
+
+       DEBUG(10, ("handle_aio_completion called with flags=%d\n", flags));
+
+       if ((flags & EVENT_FD_READ) == 0) {
+               return;
+       }
+
+       if (!NT_STATUS_IS_OK(read_data(child->sockfd,
+                                      (char *)&child->retval,
+                                      sizeof(child->retval)))) {
+               DEBUG(0, ("aio child %d died\n", (int)child->pid));
+               child->retval.size = -1;
+               child->retval.ret_errno = EIO;
+       }
+
+       if (child->cancelled) {
+               child->aiocb = NULL;
+               child->cancelled = false;
+               return;
+       }
+
+       if (child->read_cmd && (child->retval.size > 0)) {
+               SMB_ASSERT(child->retval.size <= child->aiocb->aio_nbytes);
+               memcpy((void *)child->aiocb->aio_buf, (void *)child->map->ptr,
+                      child->retval.size);
+       }
+
+       mid = child->aiocb->aio_sigevent.sigev_value.sival_int;
+
+       DEBUG(10, ("mid %d finished\n", (int)mid));
+
+       aio_request_done(mid);
+       process_aio_queue();
+}
+
+static int aio_child_destructor(struct aio_child *child)
+{
+       SMB_ASSERT((child->aiocb == NULL) || child->cancelled);
+       close(child->sockfd);
+       DLIST_REMOVE(child->list->children, child);
+       return 0;
+}
+
+static NTSTATUS create_aio_child(struct aio_child_list *children,
+                                size_t map_size,
+                                struct aio_child **presult)
+{
+       struct aio_child *result;
+       int fdpair[2];
+       NTSTATUS status;
+
+       fdpair[0] = fdpair[1] = -1;
+
+       result = TALLOC_ZERO_P(children, struct aio_child);
+       NT_STATUS_HAVE_NO_MEMORY(result);
+
+       if (socketpair(AF_UNIX, SOCK_STREAM, 0, fdpair) == -1) {
+               status = map_nt_error_from_unix(errno);
+               DEBUG(10, ("socketpair() failed: %s\n", strerror(errno)));
+               TALLOC_FREE(result);
+               goto fail;
+       }
+
+       DEBUG(10, ("fdpair = %d/%d\n", fdpair[0], fdpair[1]));
+
+       result->map = mmap_area_init(result, map_size);
+       if (result->map == NULL) {
+               status = map_nt_error_from_unix(errno);
+               DEBUG(0, ("Could not create mmap area\n"));
+               goto fail;
+       }
+
+       result->pid = sys_fork();
+       if (result->pid == -1) {
+               status = map_nt_error_from_unix(errno);
+               DEBUG(0, ("fork failed: %s\n", strerror(errno)));
+               goto fail;
+       }
+
+       if (result->pid == 0) {
+               close(fdpair[0]);
+               result->sockfd = fdpair[1];
+               aio_child_loop(result->sockfd, result->map);
+       }
+
+       DEBUG(10, ("Child %d created\n", result->pid));
+
+       result->sockfd = fdpair[0];
+       close(fdpair[1]);
+
+       result->sock_event = event_add_fd(smbd_event_context(), result,
+                                         result->sockfd, EVENT_FD_READ,
+                                         handle_aio_completion,
+                                         result);
+       if (result->sock_event == NULL) {
+               status = NT_STATUS_NO_MEMORY;
+               DEBUG(0, ("event_add_fd failed\n"));
+               goto fail;
+       }
+
+       result->list = children;
+       DLIST_ADD(children->children, result);
+
+       talloc_set_destructor(result, aio_child_destructor);
+
+       *presult = result;
+
+       return NT_STATUS_OK;
+
+ fail:
+       if (fdpair[0] != -1) close(fdpair[0]);
+       if (fdpair[1] != -1) close(fdpair[1]);
+       TALLOC_FREE(result);
+
+       return status;
+}
+
+static NTSTATUS get_idle_child(struct vfs_handle_struct *handle,
+                              struct aio_child **pchild)
+{
+       struct aio_child_list *children;
+       struct aio_child *child;
+       NTSTATUS status;
+
+       children = init_aio_children(handle);
+       if (children == NULL) {
+               return NT_STATUS_NO_MEMORY;
+       }
+
+       for (child = children->children; child != NULL; child = child->next) {
+               if (child->aiocb == NULL) {
+                       /* idle */
+                       break;
+               }
+       }
+
+       if (child == NULL) {
+               DEBUG(10, ("no idle child found, creating new one\n"));
+
+               status = create_aio_child(children, 128*1024, &child);
+               if (!NT_STATUS_IS_OK(status)) {
+                       DEBUG(10, ("create_aio_child failed: %s\n",
+                                  nt_errstr(status)));
+                       return status;
+               }
+       }
+
+       child->dont_delete = true;
+
+       *pchild = child;
+       return NT_STATUS_OK;
+}
+
+static int aio_fork_read(struct vfs_handle_struct *handle,
+                        struct files_struct *fsp, SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_child *child;
+       struct rw_cmd cmd;
+       ssize_t ret;
+       NTSTATUS status;
+
+       if (aiocb->aio_nbytes > 128*1024) {
+               /* TODO: support variable buffers */
+               errno = EINVAL;
+               return -1;
+       }
+
+       status = get_idle_child(handle, &child);
+       if (!NT_STATUS_IS_OK(status)) {
+               DEBUG(10, ("Could not get an idle child\n"));
+               return -1;
+       }
+
+       child->read_cmd = true;
+       child->aiocb = aiocb;
+       child->retval.ret_errno = EINPROGRESS;
+
+       ZERO_STRUCT(cmd);
+       cmd.n = aiocb->aio_nbytes;
+       cmd.offset = aiocb->aio_offset;
+       cmd.read_cmd = child->read_cmd;
+
+       DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd,
+                  (int)child->pid));
+
+       ret = write_fd(child->sockfd, &cmd, sizeof(cmd), fsp->fh->fd);
+       if (ret == -1) {
+               DEBUG(10, ("write_fd failed: %s\n", strerror(errno)));
+               return -1;
+       }
+
+       return 0;
+}
+
+static int aio_fork_write(struct vfs_handle_struct *handle,
+                         struct files_struct *fsp, SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_child *child;
+       struct rw_cmd cmd;
+       ssize_t ret;
+       NTSTATUS status;
+
+       if (aiocb->aio_nbytes > 128*1024) {
+               /* TODO: support variable buffers */
+               errno = EINVAL;
+               return -1;
+       }
+
+       status = get_idle_child(handle, &child);
+       if (!NT_STATUS_IS_OK(status)) {
+               DEBUG(10, ("Could not get an idle child\n"));
+               return -1;
+       }
+
+       child->read_cmd = false;
+       child->aiocb = aiocb;
+       child->retval.ret_errno = EINPROGRESS;
+
+       memcpy((void *)child->map->ptr, (void *)aiocb->aio_buf,
+              aiocb->aio_nbytes);
+
+       ZERO_STRUCT(cmd);
+       cmd.n = aiocb->aio_nbytes;
+       cmd.offset = aiocb->aio_offset;
+       cmd.read_cmd = child->read_cmd;
+
+       DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd,
+                  (int)child->pid));
+
+       ret = write_fd(child->sockfd, &cmd, sizeof(cmd), fsp->fh->fd);
+       if (ret == -1) {
+               DEBUG(10, ("write_fd failed: %s\n", strerror(errno)));
+               return -1;
+       }
+
+       return 0;
+}
+
+static struct aio_child *aio_fork_find_child(struct vfs_handle_struct *handle,
+                                            SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_child_list *children;
+       struct aio_child *child;
+
+       children = init_aio_children(handle);
+       if (children == NULL) {
+               return NULL;
+       }
+
+       for (child = children->children; child != NULL; child = child->next) {
+               if (child->aiocb == aiocb) {
+                       return child;
+               }
+       }
+
+       return NULL;
+}
+
+static ssize_t aio_fork_return_fn(struct vfs_handle_struct *handle,
+                                 struct files_struct *fsp,
+                                 SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_child *child = aio_fork_find_child(handle, aiocb);
+
+       if (child == NULL) {
+               errno = EINVAL;
+               DEBUG(0, ("returning EINVAL\n"));
+               return -1;
+       }
+
+       child->aiocb = NULL;
+
+       if (child->retval.size == -1) {
+               errno = child->retval.ret_errno;
+       }
+
+       return child->retval.size;
+}
+
+static int aio_fork_cancel(struct vfs_handle_struct *handle,
+                          struct files_struct *fsp,
+                          SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_child_list *children;
+       struct aio_child *child;
+
+       children = init_aio_children(handle);
+       if (children == NULL) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       for (child = children->children; child != NULL; child = child->next) {
+               if (child->aiocb == NULL) {
+                       continue;
+               }
+               if (child->aiocb->aio_fildes != fsp->fh->fd) {
+                       continue;
+               }
+               if ((aiocb != NULL) && (child->aiocb != aiocb)) {
+                       continue;
+               }
+
+               /*
+                * We let the child do its job, but we discard the result when
+                * it's finished.
+                */
+
+               child->cancelled = true;
+       }
+
+       return AIO_CANCELED;
+}
+
+static int aio_fork_error_fn(struct vfs_handle_struct *handle,
+                            struct files_struct *fsp,
+                            SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_child *child = aio_fork_find_child(handle, aiocb);
+
+       if (child == NULL) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       return child->retval.ret_errno;
+}
+
+/* VFS operations structure */
+
+static vfs_op_tuple aio_fork_ops[] = {
+       {SMB_VFS_OP(aio_fork_read),     SMB_VFS_OP_AIO_READ,
+        SMB_VFS_LAYER_TRANSPARENT},
+       {SMB_VFS_OP(aio_fork_write),    SMB_VFS_OP_AIO_WRITE,
+        SMB_VFS_LAYER_TRANSPARENT},
+       {SMB_VFS_OP(aio_fork_return_fn), SMB_VFS_OP_AIO_RETURN,
+        SMB_VFS_LAYER_TRANSPARENT},
+       {SMB_VFS_OP(aio_fork_cancel),   SMB_VFS_OP_AIO_CANCEL,
+        SMB_VFS_LAYER_TRANSPARENT},
+       {SMB_VFS_OP(aio_fork_error_fn), SMB_VFS_OP_AIO_ERROR,
+        SMB_VFS_LAYER_TRANSPARENT},
+       {SMB_VFS_OP(NULL),              SMB_VFS_OP_NOOP,
+        SMB_VFS_LAYER_NOOP}
+};
+
+NTSTATUS vfs_aio_fork_init(void);
+NTSTATUS vfs_aio_fork_init(void)
+{
+       return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
+                               "aio_fork", aio_fork_ops);
+}