s3:vfs: Add missing space in debug message
[samba.git] / source3 / modules / vfs_io_uring.c
/*
 * Use the io_uring of Linux (>= 5.1)
 *
 * Copyright (C) Volker Lendecke 2008
 * Copyright (C) Jeremy Allison 2010
 * Copyright (C) Stefan Metzmacher 2019
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "replace.h"

/*
 * liburing.h only needs a forward declaration
 * of struct open_how.
 *
 * If struct open_how is defined in liburing/compat.h
 * itself, hide it away in order to avoid conflicts
 * with including linux/openat2.h or defining 'struct open_how'
 * in libreplace.
 */
struct open_how;
#ifdef HAVE_STRUCT_OPEN_HOW_LIBURING_COMPAT_H
#define open_how __ignore_liburing_compat_h_open_how
#include <liburing/compat.h>
#undef open_how
#endif /* HAVE_STRUCT_OPEN_HOW_LIBURING_COMPAT_H */

#include "includes.h"
#include "system/filesys.h"
#include "smbd/smbd.h"
#include "smbd/globals.h"
#include "lib/util/tevent_unix.h"
#include "lib/util/sys_rw.h"
#include "lib/util/iov_buf.h"
#include "smbprofile.h"
#include <liburing.h>

struct vfs_io_uring_request;

struct vfs_io_uring_config {
        struct io_uring uring;
        struct tevent_fd *fde;
        /* recursion guard. See comment above vfs_io_uring_queue_run() */
        bool busy;
        /* recursion guard. See comment above vfs_io_uring_queue_run() */
        bool need_retry;
        struct vfs_io_uring_request *queue;
        struct vfs_io_uring_request *pending;
};

struct vfs_io_uring_request {
        struct vfs_io_uring_request *prev, *next;
        struct vfs_io_uring_request **list_head;
        struct vfs_io_uring_config *config;
        struct tevent_req *req;
        void (*completion_fn)(struct vfs_io_uring_request *cur,
                              const char *location);
        struct timespec start_time;
        struct timespec end_time;
        SMBPROFILE_BYTES_ASYNC_STATE(profile_bytes);
        struct io_uring_sqe sqe;
        struct io_uring_cqe cqe;
};

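/*
 * Hand a finished request back to its tevent_req: detach it from the
 * queue/pending list, record the CQE and end time, and invoke the
 * per-operation completion_fn.
 */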
static void vfs_io_uring_finish_req(struct vfs_io_uring_request *cur,
                                    const struct io_uring_cqe *cqe,
                                    struct timespec end_time,
                                    const char *location)
{
        struct tevent_req *req =
                talloc_get_type_abort(cur->req,
                struct tevent_req);
        void *state = _tevent_req_data(req);

        talloc_set_destructor(state, NULL);
        if (cur->list_head != NULL) {
                DLIST_REMOVE((*cur->list_head), cur);
                cur->list_head = NULL;
        }
        cur->cqe = *cqe;

        SMBPROFILE_BYTES_ASYNC_SET_IDLE(cur->profile_bytes);
        cur->end_time = end_time;

        /*
         * We rely on being inside the _send() function
         * or tevent_req_defer_callback() being called
         * already.
         */
        cur->completion_fn(cur, location);
}

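/*
 * Tear down the ring and fail all queued and pending requests with the
 * given error (-EUCLEAN, -ESHUTDOWN or -ESTALE at the call sites), so
 * no tevent_req is left dangling once the ring is gone.
 */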
static void vfs_io_uring_config_destroy(struct vfs_io_uring_config *config,
                                        int ret,
                                        const char *location)
{
        struct vfs_io_uring_request *cur = NULL, *next = NULL;
        struct timespec start_time;
        struct timespec end_time;
        struct io_uring_cqe err_cqe = {
                .res = ret,
        };

        PROFILE_TIMESTAMP(&start_time);

        if (config->uring.ring_fd != -1) {
                /* TODO: cancel queued and pending requests */
                TALLOC_FREE(config->fde);
                io_uring_queue_exit(&config->uring);
                config->uring.ring_fd = -1;
        }

        PROFILE_TIMESTAMP(&end_time);

        for (cur = config->pending; cur != NULL; cur = next) {
                next = cur->next;
                err_cqe.user_data = (uintptr_t)(void *)cur;
                vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
        }

        for (cur = config->queue; cur != NULL; cur = next) {
                next = cur->next;
                err_cqe.user_data = (uintptr_t)(void *)cur;
                cur->start_time = start_time;
                vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
        }
}

static int vfs_io_uring_config_destructor(struct vfs_io_uring_config *config)
{
        vfs_io_uring_config_destroy(config, -EUCLEAN, __location__);
        return 0;
}

static int vfs_io_uring_request_state_deny_destructor(void *_state)
{
        struct __vfs_io_uring_generic_state {
                struct vfs_io_uring_request ur;
        } *state = (struct __vfs_io_uring_generic_state *)_state;
        struct vfs_io_uring_request *cur = &state->ur;

        /* our parent is gone */
        cur->req = NULL;

        /* remove ourselves from any list */
        DLIST_REMOVE((*cur->list_head), cur);
        cur->list_head = NULL;

        /*
         * Our state is about to go away,
         * all we can do is shut down the whole uring.
         * But that's ok as we're most likely called from exit_server()
         */
        vfs_io_uring_config_destroy(cur->config, -ESHUTDOWN, __location__);
        return 0;
}

static void vfs_io_uring_fd_handler(struct tevent_context *ev,
                                    struct tevent_fd *fde,
                                    uint16_t flags,
                                    void *private_data);

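/*
 * Set up the per-connection io_uring and register its fd with tevent.
 *
 * Ring size and submission-queue polling come from the parametric
 * options read below; for example (values shown are the defaults):
 *
 *     vfs objects = io_uring
 *     io_uring:num_entries = 128
 *     io_uring:sqpoll = no
 */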
static int vfs_io_uring_connect(vfs_handle_struct *handle, const char *service,
                            const char *user)
{
        int ret;
        struct vfs_io_uring_config *config;
        unsigned num_entries;
        bool sqpoll;
        unsigned flags = 0;

        config = talloc_zero(handle->conn, struct vfs_io_uring_config);
        if (config == NULL) {
                DEBUG(0, ("talloc_zero() failed\n"));
                return -1;
        }

        SMB_VFS_HANDLE_SET_DATA(handle, config,
                                NULL, struct vfs_io_uring_config,
                                return -1);

        ret = SMB_VFS_NEXT_CONNECT(handle, service, user);
        if (ret < 0) {
                return ret;
        }

        num_entries = lp_parm_ulong(SNUM(handle->conn),
                                    "io_uring",
                                    "num_entries",
                                    128);
        num_entries = MAX(num_entries, 1);

        sqpoll = lp_parm_bool(SNUM(handle->conn),
                             "io_uring",
                             "sqpoll",
                             false);
        if (sqpoll) {
                flags |= IORING_SETUP_SQPOLL;
        }

        ret = io_uring_queue_init(num_entries, &config->uring, flags);
        if (ret < 0) {
                SMB_VFS_NEXT_DISCONNECT(handle);
                errno = -ret;
                return -1;
        }

        talloc_set_destructor(config, vfs_io_uring_config_destructor);

#ifdef HAVE_IO_URING_RING_DONTFORK
        ret = io_uring_ring_dontfork(&config->uring);
        if (ret < 0) {
                SMB_VFS_NEXT_DISCONNECT(handle);
                errno = -ret;
                return -1;
        }
#endif /* HAVE_IO_URING_RING_DONTFORK */

        config->fde = tevent_add_fd(handle->conn->sconn->ev_ctx,
                                    config,
                                    config->uring.ring_fd,
                                    TEVENT_FD_READ,
                                    vfs_io_uring_fd_handler,
                                    handle);
        if (config->fde == NULL) {
                ret = errno;
                SMB_VFS_NEXT_DISCONNECT(handle);
                errno = ret;
                return -1;
        }

        return 0;
}

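/*
 * Move as many queued requests as possible into the kernel submission
 * queue, submit them, then reap any completions that have already
 * arrived and hand them to vfs_io_uring_finish_req().
 */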
static void _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
{
        struct vfs_io_uring_request *cur = NULL, *next = NULL;
        struct io_uring_cqe *cqe = NULL;
        unsigned cqhead;
        unsigned nr = 0;
        struct timespec start_time;
        struct timespec end_time;
        int ret;

        PROFILE_TIMESTAMP(&start_time);

        if (config->uring.ring_fd == -1) {
                vfs_io_uring_config_destroy(config, -ESTALE, __location__);
                return;
        }

        for (cur = config->queue; cur != NULL; cur = next) {
                struct io_uring_sqe *sqe = NULL;
                void *state = _tevent_req_data(cur->req);

                next = cur->next;

                sqe = io_uring_get_sqe(&config->uring);
                if (sqe == NULL) {
                        break;
                }

                talloc_set_destructor(state,
                        vfs_io_uring_request_state_deny_destructor);
                DLIST_REMOVE(config->queue, cur);
                *sqe = cur->sqe;
                DLIST_ADD_END(config->pending, cur);
                cur->list_head = &config->pending;
                SMBPROFILE_BYTES_ASYNC_SET_BUSY(cur->profile_bytes);

                cur->start_time = start_time;
        }

        ret = io_uring_submit(&config->uring);
        if (ret == -EAGAIN || ret == -EBUSY) {
                /* We just retry later */
        } else if (ret < 0) {
                vfs_io_uring_config_destroy(config, ret, __location__);
                return;
        }

        PROFILE_TIMESTAMP(&end_time);

        io_uring_for_each_cqe(&config->uring, cqhead, cqe) {
                cur = (struct vfs_io_uring_request *)io_uring_cqe_get_data(cqe);
                vfs_io_uring_finish_req(cur, cqe, end_time, __location__);
                nr++;
        }

        io_uring_cq_advance(&config->uring, nr);
}

/*
 * Wrapper function to prevent the recursion that could happen if we
 * called _vfs_io_uring_queue_run() directly without recursion checks.
 *
 * Looking at the pread call, we can have:
 *
 * vfs_io_uring_pread_send()
 *        ->vfs_io_uring_pread_submit()  <-----------------------------------
 *                ->vfs_io_uring_request_submit()                           |
 *                        ->vfs_io_uring_queue_run()                        |
 *                                ->_vfs_io_uring_queue_run()               |
 *                                                                          |
 * But inside _vfs_io_uring_queue_run() looks like:                         |
 *                                                                          |
 * _vfs_io_uring_queue_run() {                                              |
 *      if (THIS_IO_COMPLETED) {                                            |
 *              ->vfs_io_uring_finish_req()                                 |
 *                      ->cur->completion_fn()                              |
 *      }                                                                   |
 * }                                                                        |
 *                                                                          |
 * cur->completion_fn() for pread is set to vfs_io_uring_pread_completion() |
 *                                                                          |
 * vfs_io_uring_pread_completion() {                                        |
 *      if (READ_TERMINATED) {                                              |
 *              -> tevent_req_done() - We're done, go back up the stack.    |
 *              return;                                                     |
 *      }                                                                   |
 *                                                                          |
 *      We have a short read - adjust the io vectors                        |
 *                                                                          |
 *      ->vfs_io_uring_pread_submit() ---------------------------------------
 * }
 *
 * So before calling _vfs_io_uring_queue_run() we bracket it by setting
 * the flag config->busy, and unset it once _vfs_io_uring_queue_run()
 * finally exits the retry loop.
 *
 * If we end up back in vfs_io_uring_queue_run() we notice we've
 * recursed, because config->busy is set, and don't call into
 * _vfs_io_uring_queue_run() again.
 *
 * Instead we set the second flag config->need_retry, which tells the
 * vfs_io_uring_queue_run() call above us in the stack to loop, and
 * return.
 *
 * When the outer call to _vfs_io_uring_queue_run() returns we are in
 * a loop checking if config->need_retry was set. That happens if
 * the short read case occurs and _vfs_io_uring_queue_run() ended up
 * recursing into vfs_io_uring_queue_run().
 *
 * Once vfs_io_uring_pread_completion() finishes without a short
 * read (the READ_TERMINATED case, where tevent_req_done() is called),
 * config->need_retry is left as false, we exit the loop, set
 * config->busy to false so the next top-level call into
 * vfs_io_uring_queue_run() won't think it's a recursed call, and
 * return.
 *
 */

static void vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
{
        if (config->busy) {
                /*
                 * We've recursed due to short read/write.
                 * Set need_retry to ensure we retry the
                 * io_uring_submit().
                 */
                config->need_retry = true;
                return;
        }

        /*
         * Bracket the loop calling _vfs_io_uring_queue_run()
         * with busy = true / busy = false,
         * so we can detect recursion above.
         */

        config->busy = true;

        do {
                config->need_retry = false;
                _vfs_io_uring_queue_run(config);
        } while (config->need_retry);

        config->busy = false;
}

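/*
 * Store the request pointer in its prepared SQE (user_data), append it
 * to the queue and kick a queue run, which submits it to the kernel.
 */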
static void vfs_io_uring_request_submit(struct vfs_io_uring_request *cur)
{
        struct vfs_io_uring_config *config = cur->config;

        io_uring_sqe_set_data(&cur->sqe, cur);
        DLIST_ADD_END(config->queue, cur);
        cur->list_head = &config->queue;

        vfs_io_uring_queue_run(config);
}

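/*
 * tevent fd handler: the ring fd became readable, so completions are
 * pending; run the queue to reap them.
 */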
static void vfs_io_uring_fd_handler(struct tevent_context *ev,
                                    struct tevent_fd *fde,
                                    uint16_t flags,
                                    void *private_data)
{
        vfs_handle_struct *handle = (vfs_handle_struct *)private_data;
        struct vfs_io_uring_config *config = NULL;

        SMB_VFS_HANDLE_GET_DATA(handle, config,
                                struct vfs_io_uring_config,
                                smb_panic(__location__));

        vfs_io_uring_queue_run(config);
}

struct vfs_io_uring_pread_state {
        struct files_struct *fsp;
        off_t offset;
        struct iovec iov;
        size_t nread;
        struct vfs_io_uring_request ur;
};

static void vfs_io_uring_pread_submit(struct vfs_io_uring_pread_state *state);
static void vfs_io_uring_pread_completion(struct vfs_io_uring_request *cur,
                                          const char *location);

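/*
 * Async pread: _send() validates the range and queues an IORING_OP_READV;
 * the completion handler resubmits after short reads until everything is
 * read or EOF is hit, and _recv() returns the accumulated byte count.
 */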
static struct tevent_req *vfs_io_uring_pread_send(struct vfs_handle_struct *handle,
                                             TALLOC_CTX *mem_ctx,
                                             struct tevent_context *ev,
                                             struct files_struct *fsp,
                                             void *data,
                                             size_t n, off_t offset)
{
        struct tevent_req *req = NULL;
        struct vfs_io_uring_pread_state *state = NULL;
        struct vfs_io_uring_config *config = NULL;
        bool ok;

        SMB_VFS_HANDLE_GET_DATA(handle, config,
                                struct vfs_io_uring_config,
                                smb_panic(__location__));

        req = tevent_req_create(mem_ctx, &state,
                                struct vfs_io_uring_pread_state);
        if (req == NULL) {
                return NULL;
        }
        state->ur.config = config;
        state->ur.req = req;
        state->ur.completion_fn = vfs_io_uring_pread_completion;

        SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pread, profile_p,
                                     state->ur.profile_bytes, n);
        SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);

        ok = sys_valid_io_range(offset, n);
        if (!ok) {
                tevent_req_error(req, EINVAL);
                return tevent_req_post(req, ev);
        }

        state->fsp = fsp;
        state->offset = offset;
        state->iov.iov_base = (void *)data;
        state->iov.iov_len = n;
        vfs_io_uring_pread_submit(state);

        if (!tevent_req_is_in_progress(req)) {
                return tevent_req_post(req, ev);
        }

        tevent_req_defer_callback(req, ev);
        return req;
}

static void vfs_io_uring_pread_submit(struct vfs_io_uring_pread_state *state)
{
        io_uring_prep_readv(&state->ur.sqe,
                            fsp_get_io_fd(state->fsp),
                            &state->iov, 1,
                            state->offset);
        vfs_io_uring_request_submit(&state->ur);
}

static void vfs_io_uring_pread_completion(struct vfs_io_uring_request *cur,
                                          const char *location)
{
        struct vfs_io_uring_pread_state *state = tevent_req_data(
                cur->req, struct vfs_io_uring_pread_state);
        struct iovec *iov = &state->iov;
        int num_iov = 1;
        bool ok;

        /*
         * We rely on being inside the _send() function
         * or tevent_req_defer_callback() being called
         * already.
         */

        if (cur->cqe.res < 0) {
                int err = -cur->cqe.res;
                _tevent_req_error(cur->req, err, location);
                return;
        }

        if (cur->cqe.res == 0) {
                /*
                 * We reached EOF, we're done
                 */
                tevent_req_done(cur->req);
                return;
        }

        ok = iov_advance(&iov, &num_iov, cur->cqe.res);
        if (!ok) {
                /* This is not expected! */
                DBG_ERR("iov_advance() failed cur->cqe.res=%d > iov_len=%d\n",
                        (int)cur->cqe.res,
                        (int)state->iov.iov_len);
                tevent_req_error(cur->req, EIO);
                return;
        }

        /* sys_valid_io_range() already checked the boundaries */
        state->nread += state->ur.cqe.res;
        if (num_iov == 0) {
                /* We're done */
                tevent_req_done(cur->req);
                return;
        }

        /*
         * sys_valid_io_range() already checked the boundaries
         * now try to get the rest.
         */
        state->offset += state->ur.cqe.res;
        vfs_io_uring_pread_submit(state);
}

static ssize_t vfs_io_uring_pread_recv(struct tevent_req *req,
                                  struct vfs_aio_state *vfs_aio_state)
{
        struct vfs_io_uring_pread_state *state = tevent_req_data(
                req, struct vfs_io_uring_pread_state);
        ssize_t ret;

        SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
        vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
                                                 &state->ur.start_time);

        if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
                tevent_req_received(req);
                return -1;
        }

        vfs_aio_state->error = 0;
        ret = state->nread;

        tevent_req_received(req);
        return ret;
}

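/*
 * Async pwrite: mirrors the pread path above, except that a zero-byte
 * completion is turned into ENOSPC so a non-advancing write can never
 * make us spin.
 */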
struct vfs_io_uring_pwrite_state {
        struct files_struct *fsp;
        off_t offset;
        struct iovec iov;
        size_t nwritten;
        struct vfs_io_uring_request ur;
};

static void vfs_io_uring_pwrite_submit(struct vfs_io_uring_pwrite_state *state);
static void vfs_io_uring_pwrite_completion(struct vfs_io_uring_request *cur,
                                           const char *location);

static struct tevent_req *vfs_io_uring_pwrite_send(struct vfs_handle_struct *handle,
                                              TALLOC_CTX *mem_ctx,
                                              struct tevent_context *ev,
                                              struct files_struct *fsp,
                                              const void *data,
                                              size_t n, off_t offset)
{
        struct tevent_req *req = NULL;
        struct vfs_io_uring_pwrite_state *state = NULL;
        struct vfs_io_uring_config *config = NULL;
        bool ok;

        SMB_VFS_HANDLE_GET_DATA(handle, config,
                                struct vfs_io_uring_config,
                                smb_panic(__location__));

        req = tevent_req_create(mem_ctx, &state,
                                struct vfs_io_uring_pwrite_state);
        if (req == NULL) {
                return NULL;
        }
        state->ur.config = config;
        state->ur.req = req;
        state->ur.completion_fn = vfs_io_uring_pwrite_completion;

        SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pwrite, profile_p,
                                     state->ur.profile_bytes, n);
        SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);

        ok = sys_valid_io_range(offset, n);
        if (!ok) {
                tevent_req_error(req, EINVAL);
                return tevent_req_post(req, ev);
        }

        state->fsp = fsp;
        state->offset = offset;
        state->iov.iov_base = discard_const(data);
        state->iov.iov_len = n;
        vfs_io_uring_pwrite_submit(state);

        if (!tevent_req_is_in_progress(req)) {
                return tevent_req_post(req, ev);
        }

        tevent_req_defer_callback(req, ev);
        return req;
}

static void vfs_io_uring_pwrite_submit(struct vfs_io_uring_pwrite_state *state)
{
        io_uring_prep_writev(&state->ur.sqe,
                             fsp_get_io_fd(state->fsp),
                             &state->iov, 1,
                             state->offset);
        vfs_io_uring_request_submit(&state->ur);
}

static void vfs_io_uring_pwrite_completion(struct vfs_io_uring_request *cur,
                                           const char *location)
{
        struct vfs_io_uring_pwrite_state *state = tevent_req_data(
                cur->req, struct vfs_io_uring_pwrite_state);
        struct iovec *iov = &state->iov;
        int num_iov = 1;
        bool ok;

        /*
         * We rely on being inside the _send() function
         * or tevent_req_defer_callback() being called
         * already.
         */

        if (cur->cqe.res < 0) {
                int err = -cur->cqe.res;
                _tevent_req_error(cur->req, err, location);
                return;
        }

        if (cur->cqe.res == 0) {
                /*
                 * Ensure we can never spin.
                 */
                tevent_req_error(cur->req, ENOSPC);
                return;
        }

        ok = iov_advance(&iov, &num_iov, cur->cqe.res);
        if (!ok) {
                /* This is not expected! */
                DBG_ERR("iov_advance() failed cur->cqe.res=%d > iov_len=%d\n",
                        (int)cur->cqe.res,
                        (int)state->iov.iov_len);
                tevent_req_error(cur->req, EIO);
                return;
        }

        /* sys_valid_io_range() already checked the boundaries */
        state->nwritten += state->ur.cqe.res;
        if (num_iov == 0) {
                /* We're done */
                tevent_req_done(cur->req);
                return;
        }

        /*
         * sys_valid_io_range() already checked the boundaries
         * now try to write the rest.
         */
        state->offset += state->ur.cqe.res;
        vfs_io_uring_pwrite_submit(state);
}

static ssize_t vfs_io_uring_pwrite_recv(struct tevent_req *req,
                                   struct vfs_aio_state *vfs_aio_state)
{
        struct vfs_io_uring_pwrite_state *state = tevent_req_data(
                req, struct vfs_io_uring_pwrite_state);
        ssize_t ret;

        SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
        vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
                                                 &state->ur.start_time);

        if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
                tevent_req_received(req);
                return -1;
        }

        vfs_aio_state->error = 0;
        ret = state->nwritten;

        tevent_req_received(req);
        return ret;
}

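/*
 * Async fsync: a single IORING_OP_FSYNC request; any positive result is
 * unexpected and mapped to EIO.
 */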
struct vfs_io_uring_fsync_state {
        struct vfs_io_uring_request ur;
};

static void vfs_io_uring_fsync_completion(struct vfs_io_uring_request *cur,
                                          const char *location);

static struct tevent_req *vfs_io_uring_fsync_send(struct vfs_handle_struct *handle,
                                             TALLOC_CTX *mem_ctx,
                                             struct tevent_context *ev,
                                             struct files_struct *fsp)
{
        struct tevent_req *req = NULL;
        struct vfs_io_uring_fsync_state *state = NULL;
        struct vfs_io_uring_config *config = NULL;

        SMB_VFS_HANDLE_GET_DATA(handle, config,
                                struct vfs_io_uring_config,
                                smb_panic(__location__));

        req = tevent_req_create(mem_ctx, &state,
                                struct vfs_io_uring_fsync_state);
        if (req == NULL) {
                return NULL;
        }
        state->ur.config = config;
        state->ur.req = req;
        state->ur.completion_fn = vfs_io_uring_fsync_completion;

        SMBPROFILE_BYTES_ASYNC_START(syscall_asys_fsync, profile_p,
                                     state->ur.profile_bytes, 0);
        SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);

        io_uring_prep_fsync(&state->ur.sqe,
                            fsp_get_io_fd(fsp),
                            0); /* fsync_flags */
        vfs_io_uring_request_submit(&state->ur);

        if (!tevent_req_is_in_progress(req)) {
                return tevent_req_post(req, ev);
        }

        tevent_req_defer_callback(req, ev);
        return req;
}

static void vfs_io_uring_fsync_completion(struct vfs_io_uring_request *cur,
                                          const char *location)
{
        /*
         * We rely on being inside the _send() function
         * or tevent_req_defer_callback() being called
         * already.
         */

        if (cur->cqe.res < 0) {
                int err = -cur->cqe.res;
                _tevent_req_error(cur->req, err, location);
                return;
        }

        if (cur->cqe.res > 0) {
                /* This is not expected! */
                DBG_ERR("got cur->cqe.res=%d\n", (int)cur->cqe.res);
                tevent_req_error(cur->req, EIO);
                return;
        }

        tevent_req_done(cur->req);
}

static int vfs_io_uring_fsync_recv(struct tevent_req *req,
                              struct vfs_aio_state *vfs_aio_state)
{
        struct vfs_io_uring_fsync_state *state = tevent_req_data(
                req, struct vfs_io_uring_fsync_state);

        SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
        vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
                                                 &state->ur.start_time);

        if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
                tevent_req_received(req);
                return -1;
        }

        vfs_aio_state->error = 0;

        tevent_req_received(req);
        return 0;
}

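/*
 * Only connect and the async pread/pwrite/fsync paths are overridden;
 * all other VFS operations fall through to the next module in the stack.
 */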
static struct vfs_fn_pointers vfs_io_uring_fns = {
        .connect_fn = vfs_io_uring_connect,
        .pread_send_fn = vfs_io_uring_pread_send,
        .pread_recv_fn = vfs_io_uring_pread_recv,
        .pwrite_send_fn = vfs_io_uring_pwrite_send,
        .pwrite_recv_fn = vfs_io_uring_pwrite_recv,
        .fsync_send_fn = vfs_io_uring_fsync_send,
        .fsync_recv_fn = vfs_io_uring_fsync_recv,
};

static_decl_vfs;
NTSTATUS vfs_io_uring_init(TALLOC_CTX *ctx)
{
        return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
                                "io_uring", &vfs_io_uring_fns);
}