2 * Use the io_uring of Linux (>= 5.1)
4 * Copyright (C) Volker Lendecke 2008
5 * Copyright (C) Jeremy Allison 2010
6 * Copyright (C) Stefan Metzmacher 2019
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 #include "system/filesys.h"
25 #include "smbd/smbd.h"
26 #include "smbd/globals.h"
27 #include "lib/util/tevent_unix.h"
28 #include "smbprofile.h"
31 struct vfs_io_uring_request;
33 struct vfs_io_uring_config {
34 struct io_uring uring;
35 struct tevent_fd *fde;
36 struct vfs_io_uring_request *queue;
37 struct vfs_io_uring_request *pending;
40 struct vfs_io_uring_request {
41 struct vfs_io_uring_request *prev, *next;
42 struct vfs_io_uring_request **list_head;
43 struct vfs_io_uring_config *config;
44 struct tevent_req *req;
45 struct io_uring_sqe sqe;
46 struct io_uring_cqe cqe;
47 void (*completion_fn)(struct vfs_io_uring_request *cur,
48 const char *location);
49 struct timespec start_time;
50 struct timespec end_time;
51 SMBPROFILE_BYTES_ASYNC_STATE(profile_bytes);
54 static void vfs_io_uring_finish_req(struct vfs_io_uring_request *cur,
55 const struct io_uring_cqe *cqe,
56 struct timespec end_time,
59 struct tevent_req *req =
60 talloc_get_type_abort(cur->req,
62 void *state = _tevent_req_data(req);
64 talloc_set_destructor(state, NULL);
65 if (cur->list_head != NULL) {
66 DLIST_REMOVE((*cur->list_head), cur);
67 cur->list_head = NULL;
71 SMBPROFILE_BYTES_ASYNC_SET_IDLE(cur->profile_bytes);
72 cur->end_time = end_time;
75 * We rely on being inside the _send() function
76 * or tevent_req_defer_callback() being called
79 cur->completion_fn(cur, location);
82 static void vfs_io_uring_config_destroy(struct vfs_io_uring_config *config,
86 struct vfs_io_uring_request *cur = NULL, *next = NULL;
87 struct timespec start_time;
88 struct timespec end_time;
89 struct io_uring_cqe err_cqe = {
93 PROFILE_TIMESTAMP(&start_time);
95 if (config->uring.ring_fd != -1) {
96 /* TODO: cancel queued and pending requests */
97 TALLOC_FREE(config->fde);
98 io_uring_queue_exit(&config->uring);
99 config->uring.ring_fd = -1;
102 PROFILE_TIMESTAMP(&end_time);
104 for (cur = config->pending; cur != NULL; cur = next) {
106 err_cqe.user_data = (uintptr_t)(void *)cur;
107 vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
110 for (cur = config->queue; cur != NULL; cur = next) {
112 err_cqe.user_data = (uintptr_t)(void *)cur;
113 cur->start_time = start_time;
114 vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
118 static int vfs_io_uring_config_destructor(struct vfs_io_uring_config *config)
120 vfs_io_uring_config_destroy(config, -EUCLEAN, __location__);
124 static int vfs_io_uring_request_state_deny_destructor(void *_state)
126 struct __vfs_io_uring_generic_state {
127 struct vfs_io_uring_request ur;
128 } *state = (struct __vfs_io_uring_generic_state *)_state;
129 struct vfs_io_uring_request *cur = &state->ur;
131 /* our parent is gone */
134 /* remove ourself from any list */
135 DLIST_REMOVE((*cur->list_head), cur);
136 cur->list_head = NULL;
139 * Our state is about to go away,
140 * all we can do is shutting down the whole uring.
141 * But that's ok as we're most likely called from exit_server()
143 vfs_io_uring_config_destroy(cur->config, -ESHUTDOWN, __location__);
147 static void vfs_io_uring_fd_handler(struct tevent_context *ev,
148 struct tevent_fd *fde,
152 static int vfs_io_uring_connect(vfs_handle_struct *handle, const char *service,
156 struct vfs_io_uring_config *config;
157 unsigned num_entries;
161 config = talloc_zero(handle->conn, struct vfs_io_uring_config);
162 if (config == NULL) {
163 DEBUG(0, ("talloc_zero() failed\n"));
167 SMB_VFS_HANDLE_SET_DATA(handle, config,
168 NULL, struct vfs_io_uring_config,
171 ret = SMB_VFS_NEXT_CONNECT(handle, service, user);
176 num_entries = lp_parm_ulong(SNUM(handle->conn),
180 num_entries = MAX(num_entries, 1);
182 sqpoll = lp_parm_bool(SNUM(handle->conn),
187 flags |= IORING_SETUP_SQPOLL;
190 ret = io_uring_queue_init(num_entries, &config->uring, flags);
192 SMB_VFS_NEXT_DISCONNECT(handle);
197 talloc_set_destructor(config, vfs_io_uring_config_destructor);
199 #ifdef HAVE_IO_URING_RING_DONTFORK
200 ret = io_uring_ring_dontfork(&config->uring);
202 SMB_VFS_NEXT_DISCONNECT(handle);
206 #endif /* HAVE_IO_URING_RING_DONTFORK */
208 config->fde = tevent_add_fd(handle->conn->sconn->ev_ctx,
210 config->uring.ring_fd,
212 vfs_io_uring_fd_handler,
214 if (config->fde == NULL) {
216 SMB_VFS_NEXT_DISCONNECT(handle);
224 static void vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
226 struct vfs_io_uring_request *cur = NULL, *next = NULL;
227 struct io_uring_cqe *cqe = NULL;
230 struct timespec start_time;
231 struct timespec end_time;
234 PROFILE_TIMESTAMP(&start_time);
236 if (config->uring.ring_fd == -1) {
237 vfs_io_uring_config_destroy(config, -ESTALE, __location__);
241 for (cur = config->queue; cur != NULL; cur = next) {
242 struct io_uring_sqe *sqe = NULL;
243 void *state = _tevent_req_data(cur->req);
247 sqe = io_uring_get_sqe(&config->uring);
252 talloc_set_destructor(state,
253 vfs_io_uring_request_state_deny_destructor);
254 DLIST_REMOVE(config->queue, cur);
256 DLIST_ADD_END(config->pending, cur);
257 cur->list_head = &config->pending;
258 SMBPROFILE_BYTES_ASYNC_SET_BUSY(cur->profile_bytes);
260 cur->start_time = start_time;
263 ret = io_uring_submit(&config->uring);
264 if (ret == -EAGAIN || ret == -EBUSY) {
265 /* We just retry later */
266 } else if (ret < 0) {
267 vfs_io_uring_config_destroy(config, ret, __location__);
271 PROFILE_TIMESTAMP(&end_time);
273 io_uring_for_each_cqe(&config->uring, cqhead, cqe) {
274 cur = (struct vfs_io_uring_request *)io_uring_cqe_get_data(cqe);
275 vfs_io_uring_finish_req(cur, cqe, end_time, __location__);
279 io_uring_cq_advance(&config->uring, nr);
282 static void vfs_io_uring_fd_handler(struct tevent_context *ev,
283 struct tevent_fd *fde,
287 vfs_handle_struct *handle = (vfs_handle_struct *)private_data;
288 struct vfs_io_uring_config *config = NULL;
290 SMB_VFS_HANDLE_GET_DATA(handle, config,
291 struct vfs_io_uring_config,
292 smb_panic(__location__));
294 vfs_io_uring_queue_run(config);
297 struct vfs_io_uring_pread_state {
298 struct vfs_io_uring_request ur;
303 static void vfs_io_uring_pread_completion(struct vfs_io_uring_request *cur,
304 const char *location);
306 static struct tevent_req *vfs_io_uring_pread_send(struct vfs_handle_struct *handle,
308 struct tevent_context *ev,
309 struct files_struct *fsp,
311 size_t n, off_t offset)
313 struct tevent_req *req = NULL;
314 struct vfs_io_uring_pread_state *state = NULL;
315 struct vfs_io_uring_config *config = NULL;
317 SMB_VFS_HANDLE_GET_DATA(handle, config,
318 struct vfs_io_uring_config,
319 smb_panic(__location__));
321 req = tevent_req_create(mem_ctx, &state,
322 struct vfs_io_uring_pread_state);
326 state->ur.config = config;
328 state->ur.completion_fn = vfs_io_uring_pread_completion;
330 SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pread, profile_p,
331 state->ur.profile_bytes, n);
332 SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);
334 state->iov.iov_base = (void *)data;
335 state->iov.iov_len = n;
336 io_uring_prep_readv(&state->ur.sqe,
340 io_uring_sqe_set_data(&state->ur.sqe, &state->ur);
341 DLIST_ADD_END(config->queue, &state->ur);
342 state->ur.list_head = &config->queue;
344 vfs_io_uring_queue_run(config);
346 if (!tevent_req_is_in_progress(req)) {
347 return tevent_req_post(req, ev);
350 tevent_req_defer_callback(req, ev);
354 static void vfs_io_uring_pread_completion(struct vfs_io_uring_request *cur,
355 const char *location)
357 struct vfs_io_uring_pread_state *state = tevent_req_data(
358 cur->req, struct vfs_io_uring_pread_state);
361 * We rely on being inside the _send() function
362 * or tevent_req_defer_callback() being called
366 if (cur->cqe.res < 0) {
367 int err = -cur->cqe.res;
368 _tevent_req_error(cur->req, err, location);
372 state->nread = state->ur.cqe.res;
373 tevent_req_done(cur->req);
376 static ssize_t vfs_io_uring_pread_recv(struct tevent_req *req,
377 struct vfs_aio_state *vfs_aio_state)
379 struct vfs_io_uring_pread_state *state = tevent_req_data(
380 req, struct vfs_io_uring_pread_state);
383 SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
384 vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
385 &state->ur.start_time);
387 if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
388 tevent_req_received(req);
392 vfs_aio_state->error = 0;
395 tevent_req_received(req);
399 struct vfs_io_uring_pwrite_state {
400 struct vfs_io_uring_request ur;
405 static void vfs_io_uring_pwrite_completion(struct vfs_io_uring_request *cur,
406 const char *location);
408 static struct tevent_req *vfs_io_uring_pwrite_send(struct vfs_handle_struct *handle,
410 struct tevent_context *ev,
411 struct files_struct *fsp,
413 size_t n, off_t offset)
415 struct tevent_req *req = NULL;
416 struct vfs_io_uring_pwrite_state *state = NULL;
417 struct vfs_io_uring_config *config = NULL;
419 SMB_VFS_HANDLE_GET_DATA(handle, config,
420 struct vfs_io_uring_config,
421 smb_panic(__location__));
423 req = tevent_req_create(mem_ctx, &state,
424 struct vfs_io_uring_pwrite_state);
428 state->ur.config = config;
430 state->ur.completion_fn = vfs_io_uring_pwrite_completion;
432 SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pwrite, profile_p,
433 state->ur.profile_bytes, n);
434 SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);
436 state->iov.iov_base = discard_const(data);
437 state->iov.iov_len = n;
438 io_uring_prep_writev(&state->ur.sqe,
442 io_uring_sqe_set_data(&state->ur.sqe, &state->ur);
443 DLIST_ADD_END(config->queue, &state->ur);
444 state->ur.list_head = &config->queue;
446 vfs_io_uring_queue_run(config);
448 if (!tevent_req_is_in_progress(req)) {
449 return tevent_req_post(req, ev);
452 tevent_req_defer_callback(req, ev);
456 static void vfs_io_uring_pwrite_completion(struct vfs_io_uring_request *cur,
457 const char *location)
459 struct vfs_io_uring_pwrite_state *state = tevent_req_data(
460 cur->req, struct vfs_io_uring_pwrite_state);
463 * We rely on being inside the _send() function
464 * or tevent_req_defer_callback() being called
468 if (cur->cqe.res < 0) {
469 int err = -cur->cqe.res;
470 _tevent_req_error(cur->req, err, location);
474 state->nwritten = state->ur.cqe.res;
475 tevent_req_done(cur->req);
478 static ssize_t vfs_io_uring_pwrite_recv(struct tevent_req *req,
479 struct vfs_aio_state *vfs_aio_state)
481 struct vfs_io_uring_pwrite_state *state = tevent_req_data(
482 req, struct vfs_io_uring_pwrite_state);
485 SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
486 vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
487 &state->ur.start_time);
489 if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
490 tevent_req_received(req);
494 vfs_aio_state->error = 0;
495 ret = state->nwritten;
497 tevent_req_received(req);
501 struct vfs_io_uring_fsync_state {
502 struct vfs_io_uring_request ur;
505 static void vfs_io_uring_fsync_completion(struct vfs_io_uring_request *cur,
506 const char *location);
508 static struct tevent_req *vfs_io_uring_fsync_send(struct vfs_handle_struct *handle,
510 struct tevent_context *ev,
511 struct files_struct *fsp)
513 struct tevent_req *req = NULL;
514 struct vfs_io_uring_fsync_state *state = NULL;
515 struct vfs_io_uring_config *config = NULL;
517 SMB_VFS_HANDLE_GET_DATA(handle, config,
518 struct vfs_io_uring_config,
519 smb_panic(__location__));
521 req = tevent_req_create(mem_ctx, &state,
522 struct vfs_io_uring_fsync_state);
526 state->ur.config = config;
528 state->ur.completion_fn = vfs_io_uring_fsync_completion;
530 SMBPROFILE_BYTES_ASYNC_START(syscall_asys_fsync, profile_p,
531 state->ur.profile_bytes, 0);
532 SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);
534 io_uring_prep_fsync(&state->ur.sqe,
536 0); /* fsync_flags */
537 io_uring_sqe_set_data(&state->ur.sqe, &state->ur);
538 DLIST_ADD_END(config->queue, &state->ur);
539 state->ur.list_head = &config->queue;
541 vfs_io_uring_queue_run(config);
543 if (!tevent_req_is_in_progress(req)) {
544 return tevent_req_post(req, ev);
547 tevent_req_defer_callback(req, ev);
551 static void vfs_io_uring_fsync_completion(struct vfs_io_uring_request *cur,
552 const char *location)
555 * We rely on being inside the _send() function
556 * or tevent_req_defer_callback() being called
559 _tevent_req_done(cur->req, location);
562 static int vfs_io_uring_fsync_recv(struct tevent_req *req,
563 struct vfs_aio_state *vfs_aio_state)
565 struct vfs_io_uring_fsync_state *state = tevent_req_data(
566 req, struct vfs_io_uring_fsync_state);
569 SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
570 vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
571 &state->ur.start_time);
573 if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
577 if (state->ur.cqe.res < 0) {
578 vfs_aio_state->error = -state->ur.cqe.res;
581 vfs_aio_state->error = 0;
582 ret = state->ur.cqe.res;
585 tevent_req_received(req);
589 static struct vfs_fn_pointers vfs_io_uring_fns = {
590 .connect_fn = vfs_io_uring_connect,
591 .pread_send_fn = vfs_io_uring_pread_send,
592 .pread_recv_fn = vfs_io_uring_pread_recv,
593 .pwrite_send_fn = vfs_io_uring_pwrite_send,
594 .pwrite_recv_fn = vfs_io_uring_pwrite_recv,
595 .fsync_send_fn = vfs_io_uring_fsync_send,
596 .fsync_recv_fn = vfs_io_uring_fsync_recv,
600 NTSTATUS vfs_io_uring_init(TALLOC_CTX *ctx)
602 return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
603 "io_uring", &vfs_io_uring_fns);