/*
 * Use the io_uring of Linux (>= 5.1)
 *
 * Copyright (C) Volker Lendecke 2008
 * Copyright (C) Jeremy Allison 2010
 * Copyright (C) Stefan Metzmacher 2019
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "includes.h"
#include "system/filesys.h"
#include "smbd/smbd.h"
#include "smbd/globals.h"
#include "lib/util/tevent_unix.h"
#include "lib/util/sys_rw.h"
#include "smbprofile.h"
#include <liburing.h>
32 struct vfs_io_uring_request;
34 struct vfs_io_uring_config {
35 struct io_uring uring;
36 struct tevent_fd *fde;
37 /* recursion guard. See comment above vfs_io_uring_queue_run() */
39 /* recursion guard. See comment above vfs_io_uring_queue_run() */
41 struct vfs_io_uring_request *queue;
42 struct vfs_io_uring_request *pending;
45 struct vfs_io_uring_request {
46 struct vfs_io_uring_request *prev, *next;
47 struct vfs_io_uring_request **list_head;
48 struct vfs_io_uring_config *config;
49 struct tevent_req *req;
50 struct io_uring_sqe sqe;
51 struct io_uring_cqe cqe;
52 void (*completion_fn)(struct vfs_io_uring_request *cur,
53 const char *location);
54 struct timespec start_time;
55 struct timespec end_time;
56 SMBPROFILE_BYTES_ASYNC_STATE(profile_bytes);
59 static void vfs_io_uring_finish_req(struct vfs_io_uring_request *cur,
60 const struct io_uring_cqe *cqe,
61 struct timespec end_time,
64 struct tevent_req *req =
65 talloc_get_type_abort(cur->req,
67 void *state = _tevent_req_data(req);
69 talloc_set_destructor(state, NULL);
70 if (cur->list_head != NULL) {
71 DLIST_REMOVE((*cur->list_head), cur);
72 cur->list_head = NULL;
76 SMBPROFILE_BYTES_ASYNC_SET_IDLE(cur->profile_bytes);
77 cur->end_time = end_time;
80 * We rely on being inside the _send() function
81 * or tevent_req_defer_callback() being called
84 cur->completion_fn(cur, location);
87 static void vfs_io_uring_config_destroy(struct vfs_io_uring_config *config,
91 struct vfs_io_uring_request *cur = NULL, *next = NULL;
92 struct timespec start_time;
93 struct timespec end_time;
94 struct io_uring_cqe err_cqe = {
98 PROFILE_TIMESTAMP(&start_time);
100 if (config->uring.ring_fd != -1) {
101 /* TODO: cancel queued and pending requests */
102 TALLOC_FREE(config->fde);
103 io_uring_queue_exit(&config->uring);
104 config->uring.ring_fd = -1;
107 PROFILE_TIMESTAMP(&end_time);
109 for (cur = config->pending; cur != NULL; cur = next) {
111 err_cqe.user_data = (uintptr_t)(void *)cur;
112 vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
115 for (cur = config->queue; cur != NULL; cur = next) {
117 err_cqe.user_data = (uintptr_t)(void *)cur;
118 cur->start_time = start_time;
119 vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
123 static int vfs_io_uring_config_destructor(struct vfs_io_uring_config *config)
125 vfs_io_uring_config_destroy(config, -EUCLEAN, __location__);
129 static int vfs_io_uring_request_state_deny_destructor(void *_state)
131 struct __vfs_io_uring_generic_state {
132 struct vfs_io_uring_request ur;
133 } *state = (struct __vfs_io_uring_generic_state *)_state;
134 struct vfs_io_uring_request *cur = &state->ur;
136 /* our parent is gone */
139 /* remove ourself from any list */
140 DLIST_REMOVE((*cur->list_head), cur);
141 cur->list_head = NULL;
144 * Our state is about to go away,
145 * all we can do is shutting down the whole uring.
146 * But that's ok as we're most likely called from exit_server()
148 vfs_io_uring_config_destroy(cur->config, -ESHUTDOWN, __location__);
/* forward declaration; definition below vfs_io_uring_request_submit() */
static void vfs_io_uring_fd_handler(struct tevent_context *ev,
				    struct tevent_fd *fde,
				    uint16_t flags,
				    void *private_data);
157 static int vfs_io_uring_connect(vfs_handle_struct *handle, const char *service,
161 struct vfs_io_uring_config *config;
162 unsigned num_entries;
166 config = talloc_zero(handle->conn, struct vfs_io_uring_config);
167 if (config == NULL) {
168 DEBUG(0, ("talloc_zero() failed\n"));
172 SMB_VFS_HANDLE_SET_DATA(handle, config,
173 NULL, struct vfs_io_uring_config,
176 ret = SMB_VFS_NEXT_CONNECT(handle, service, user);
181 num_entries = lp_parm_ulong(SNUM(handle->conn),
185 num_entries = MAX(num_entries, 1);
187 sqpoll = lp_parm_bool(SNUM(handle->conn),
192 flags |= IORING_SETUP_SQPOLL;
195 ret = io_uring_queue_init(num_entries, &config->uring, flags);
197 SMB_VFS_NEXT_DISCONNECT(handle);
202 talloc_set_destructor(config, vfs_io_uring_config_destructor);
204 #ifdef HAVE_IO_URING_RING_DONTFORK
205 ret = io_uring_ring_dontfork(&config->uring);
207 SMB_VFS_NEXT_DISCONNECT(handle);
211 #endif /* HAVE_IO_URING_RING_DONTFORK */
213 config->fde = tevent_add_fd(handle->conn->sconn->ev_ctx,
215 config->uring.ring_fd,
217 vfs_io_uring_fd_handler,
219 if (config->fde == NULL) {
221 SMB_VFS_NEXT_DISCONNECT(handle);
229 static void _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
231 struct vfs_io_uring_request *cur = NULL, *next = NULL;
232 struct io_uring_cqe *cqe = NULL;
235 struct timespec start_time;
236 struct timespec end_time;
239 PROFILE_TIMESTAMP(&start_time);
241 if (config->uring.ring_fd == -1) {
242 vfs_io_uring_config_destroy(config, -ESTALE, __location__);
246 for (cur = config->queue; cur != NULL; cur = next) {
247 struct io_uring_sqe *sqe = NULL;
248 void *state = _tevent_req_data(cur->req);
252 sqe = io_uring_get_sqe(&config->uring);
257 talloc_set_destructor(state,
258 vfs_io_uring_request_state_deny_destructor);
259 DLIST_REMOVE(config->queue, cur);
261 DLIST_ADD_END(config->pending, cur);
262 cur->list_head = &config->pending;
263 SMBPROFILE_BYTES_ASYNC_SET_BUSY(cur->profile_bytes);
265 cur->start_time = start_time;
268 ret = io_uring_submit(&config->uring);
269 if (ret == -EAGAIN || ret == -EBUSY) {
270 /* We just retry later */
271 } else if (ret < 0) {
272 vfs_io_uring_config_destroy(config, ret, __location__);
276 PROFILE_TIMESTAMP(&end_time);
278 io_uring_for_each_cqe(&config->uring, cqhead, cqe) {
279 cur = (struct vfs_io_uring_request *)io_uring_cqe_get_data(cqe);
280 vfs_io_uring_finish_req(cur, cqe, end_time, __location__);
284 io_uring_cq_advance(&config->uring, nr);
/*
 * Wrapper function to prevent recursion which could happen
 * if we called _vfs_io_uring_queue_run() directly without
 * recursion checks.
 *
 * Looking at the pread call, we can have:
 *
 * vfs_io_uring_pread_send()
 *        ->vfs_io_uring_pread_submit() <-----------------------------------
 *                ->vfs_io_uring_request_submit()                          |
 *                        ->vfs_io_uring_queue_run()                       |
 *                                ->_vfs_io_uring_queue_run()              |
 *                                                                         |
 * But inside _vfs_io_uring_queue_run() looks like:                        |
 *                                                                         |
 * _vfs_io_uring_queue_run() {                                             |
 *        if (THIS_IO_COMPLETED) {                                         |
 *                ->vfs_io_uring_finish_req()                              |
 *                        ->cur->completion_fn()                           |
 *        }                                                                |
 * }                                                                       |
 *                                                                         |
 * cur->completion_fn() for pread is set to vfs_io_uring_pread_completion()|
 *                                                                         |
 * vfs_io_uring_pread_completion() {                                       |
 *        if (READ_TERMINATED) {                                           |
 *                -> tevent_req_done() - We're done, go back up the stack. |
 *                return;                                                  |
 *        }                                                                |
 *                                                                         |
 *        We have a short read - adjust the io vectors                     |
 *                                                                         |
 *        ->vfs_io_uring_pread_submit() ---------------------------------------
 * }
 *
 * So before calling _vfs_io_uring_queue_run() we bracket it with setting
 * a flag config->busy, and unset it once _vfs_io_uring_queue_run() finally
 * exits the retry loop.
 *
 * If we end up back into vfs_io_uring_queue_run() we notice we've done so
 * as config->busy is set and don't recurse into _vfs_io_uring_queue_run().
 *
 * We set the second flag config->need_retry that tells us to loop in the
 * vfs_io_uring_queue_run() call above us in the stack and return.
 *
 * When the outer call to _vfs_io_uring_queue_run() returns we are in
 * a loop checking if config->need_retry was set. That happens if
 * the short read case occurs and _vfs_io_uring_queue_run() ended up
 * recursing into vfs_io_uring_queue_run().
 *
 * Once vfs_io_uring_pread_completion() finishes without a short
 * read (the READ_TERMINATED case, tevent_req_done() is called)
 * then config->need_retry is left as false, we exit the loop,
 * set config->busy to false so the next top level call into
 * vfs_io_uring_queue_run() won't think it's a recursed call
 * and return.
 */
346 static void vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
350 * We've recursed due to short read/write.
351 * Set need_retry to ensure we retry the
354 config->need_retry = true;
359 * Bracket the loop calling _vfs_io_uring_queue_run()
360 * with busy = true / busy = false.
361 * so we can detect recursion above.
367 config->need_retry = false;
368 _vfs_io_uring_queue_run(config);
369 } while (config->need_retry);
371 config->busy = false;
374 static void vfs_io_uring_request_submit(struct vfs_io_uring_request *cur)
376 struct vfs_io_uring_config *config = cur->config;
378 io_uring_sqe_set_data(&cur->sqe, cur);
379 DLIST_ADD_END(config->queue, cur);
380 cur->list_head = &config->queue;
382 vfs_io_uring_queue_run(config);
385 static void vfs_io_uring_fd_handler(struct tevent_context *ev,
386 struct tevent_fd *fde,
390 vfs_handle_struct *handle = (vfs_handle_struct *)private_data;
391 struct vfs_io_uring_config *config = NULL;
393 SMB_VFS_HANDLE_GET_DATA(handle, config,
394 struct vfs_io_uring_config,
395 smb_panic(__location__));
397 vfs_io_uring_queue_run(config);
400 struct vfs_io_uring_pread_state {
401 struct vfs_io_uring_request ur;
406 static void vfs_io_uring_pread_completion(struct vfs_io_uring_request *cur,
407 const char *location);
409 static struct tevent_req *vfs_io_uring_pread_send(struct vfs_handle_struct *handle,
411 struct tevent_context *ev,
412 struct files_struct *fsp,
414 size_t n, off_t offset)
416 struct tevent_req *req = NULL;
417 struct vfs_io_uring_pread_state *state = NULL;
418 struct vfs_io_uring_config *config = NULL;
421 SMB_VFS_HANDLE_GET_DATA(handle, config,
422 struct vfs_io_uring_config,
423 smb_panic(__location__));
425 req = tevent_req_create(mem_ctx, &state,
426 struct vfs_io_uring_pread_state);
430 state->ur.config = config;
432 state->ur.completion_fn = vfs_io_uring_pread_completion;
434 SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pread, profile_p,
435 state->ur.profile_bytes, n);
436 SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);
438 ok = sys_valid_io_range(offset, n);
440 tevent_req_error(req, EINVAL);
441 return tevent_req_post(req, ev);
444 state->iov.iov_base = (void *)data;
445 state->iov.iov_len = n;
446 io_uring_prep_readv(&state->ur.sqe,
450 vfs_io_uring_request_submit(&state->ur);
452 if (!tevent_req_is_in_progress(req)) {
453 return tevent_req_post(req, ev);
456 tevent_req_defer_callback(req, ev);
460 static void vfs_io_uring_pread_completion(struct vfs_io_uring_request *cur,
461 const char *location)
463 struct vfs_io_uring_pread_state *state = tevent_req_data(
464 cur->req, struct vfs_io_uring_pread_state);
467 * We rely on being inside the _send() function
468 * or tevent_req_defer_callback() being called
472 if (cur->cqe.res < 0) {
473 int err = -cur->cqe.res;
474 _tevent_req_error(cur->req, err, location);
478 state->nread = state->ur.cqe.res;
479 tevent_req_done(cur->req);
482 static ssize_t vfs_io_uring_pread_recv(struct tevent_req *req,
483 struct vfs_aio_state *vfs_aio_state)
485 struct vfs_io_uring_pread_state *state = tevent_req_data(
486 req, struct vfs_io_uring_pread_state);
489 SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
490 vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
491 &state->ur.start_time);
493 if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
494 tevent_req_received(req);
498 vfs_aio_state->error = 0;
501 tevent_req_received(req);
505 struct vfs_io_uring_pwrite_state {
506 struct vfs_io_uring_request ur;
511 static void vfs_io_uring_pwrite_completion(struct vfs_io_uring_request *cur,
512 const char *location);
514 static struct tevent_req *vfs_io_uring_pwrite_send(struct vfs_handle_struct *handle,
516 struct tevent_context *ev,
517 struct files_struct *fsp,
519 size_t n, off_t offset)
521 struct tevent_req *req = NULL;
522 struct vfs_io_uring_pwrite_state *state = NULL;
523 struct vfs_io_uring_config *config = NULL;
526 SMB_VFS_HANDLE_GET_DATA(handle, config,
527 struct vfs_io_uring_config,
528 smb_panic(__location__));
530 req = tevent_req_create(mem_ctx, &state,
531 struct vfs_io_uring_pwrite_state);
535 state->ur.config = config;
537 state->ur.completion_fn = vfs_io_uring_pwrite_completion;
539 SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pwrite, profile_p,
540 state->ur.profile_bytes, n);
541 SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);
543 ok = sys_valid_io_range(offset, n);
545 tevent_req_error(req, EINVAL);
546 return tevent_req_post(req, ev);
549 state->iov.iov_base = discard_const(data);
550 state->iov.iov_len = n;
551 io_uring_prep_writev(&state->ur.sqe,
555 vfs_io_uring_request_submit(&state->ur);
557 if (!tevent_req_is_in_progress(req)) {
558 return tevent_req_post(req, ev);
561 tevent_req_defer_callback(req, ev);
565 static void vfs_io_uring_pwrite_completion(struct vfs_io_uring_request *cur,
566 const char *location)
568 struct vfs_io_uring_pwrite_state *state = tevent_req_data(
569 cur->req, struct vfs_io_uring_pwrite_state);
572 * We rely on being inside the _send() function
573 * or tevent_req_defer_callback() being called
577 if (cur->cqe.res < 0) {
578 int err = -cur->cqe.res;
579 _tevent_req_error(cur->req, err, location);
583 state->nwritten = state->ur.cqe.res;
584 tevent_req_done(cur->req);
587 static ssize_t vfs_io_uring_pwrite_recv(struct tevent_req *req,
588 struct vfs_aio_state *vfs_aio_state)
590 struct vfs_io_uring_pwrite_state *state = tevent_req_data(
591 req, struct vfs_io_uring_pwrite_state);
594 SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
595 vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
596 &state->ur.start_time);
598 if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
599 tevent_req_received(req);
603 vfs_aio_state->error = 0;
604 ret = state->nwritten;
606 tevent_req_received(req);
610 struct vfs_io_uring_fsync_state {
611 struct vfs_io_uring_request ur;
614 static void vfs_io_uring_fsync_completion(struct vfs_io_uring_request *cur,
615 const char *location);
617 static struct tevent_req *vfs_io_uring_fsync_send(struct vfs_handle_struct *handle,
619 struct tevent_context *ev,
620 struct files_struct *fsp)
622 struct tevent_req *req = NULL;
623 struct vfs_io_uring_fsync_state *state = NULL;
624 struct vfs_io_uring_config *config = NULL;
626 SMB_VFS_HANDLE_GET_DATA(handle, config,
627 struct vfs_io_uring_config,
628 smb_panic(__location__));
630 req = tevent_req_create(mem_ctx, &state,
631 struct vfs_io_uring_fsync_state);
635 state->ur.config = config;
637 state->ur.completion_fn = vfs_io_uring_fsync_completion;
639 SMBPROFILE_BYTES_ASYNC_START(syscall_asys_fsync, profile_p,
640 state->ur.profile_bytes, 0);
641 SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);
643 io_uring_prep_fsync(&state->ur.sqe,
645 0); /* fsync_flags */
646 vfs_io_uring_request_submit(&state->ur);
648 if (!tevent_req_is_in_progress(req)) {
649 return tevent_req_post(req, ev);
652 tevent_req_defer_callback(req, ev);
656 static void vfs_io_uring_fsync_completion(struct vfs_io_uring_request *cur,
657 const char *location)
660 * We rely on being inside the _send() function
661 * or tevent_req_defer_callback() being called
665 if (cur->cqe.res < 0) {
666 int err = -cur->cqe.res;
667 _tevent_req_error(cur->req, err, location);
671 tevent_req_done(cur->req);
674 static int vfs_io_uring_fsync_recv(struct tevent_req *req,
675 struct vfs_aio_state *vfs_aio_state)
677 struct vfs_io_uring_fsync_state *state = tevent_req_data(
678 req, struct vfs_io_uring_fsync_state);
680 SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
681 vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
682 &state->ur.start_time);
684 if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
685 tevent_req_received(req);
689 vfs_aio_state->error = 0;
691 tevent_req_received(req);
695 static struct vfs_fn_pointers vfs_io_uring_fns = {
696 .connect_fn = vfs_io_uring_connect,
697 .pread_send_fn = vfs_io_uring_pread_send,
698 .pread_recv_fn = vfs_io_uring_pread_recv,
699 .pwrite_send_fn = vfs_io_uring_pwrite_send,
700 .pwrite_recv_fn = vfs_io_uring_pwrite_recv,
701 .fsync_send_fn = vfs_io_uring_fsync_send,
702 .fsync_recv_fn = vfs_io_uring_fsync_recv,
706 NTSTATUS vfs_io_uring_init(TALLOC_CTX *ctx)
708 return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
709 "io_uring", &vfs_io_uring_fns);