vfs_io_uring: move error handling out of vfs_io_uring_pwrite_recv()
[samba.git] source3/modules/vfs_io_uring.c
/*
 * Use the io_uring of Linux (>= 5.1)
 *
 * Copyright (C) Volker Lendecke 2008
 * Copyright (C) Jeremy Allison 2010
 * Copyright (C) Stefan Metzmacher 2019
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "includes.h"
#include "system/filesys.h"
#include "smbd/smbd.h"
#include "smbd/globals.h"
#include "lib/util/tevent_unix.h"
#include "smbprofile.h"
#include <liburing.h>

struct vfs_io_uring_request;

struct vfs_io_uring_config {
        struct io_uring uring;
        struct tevent_fd *fde;
        struct vfs_io_uring_request *queue;
        struct vfs_io_uring_request *pending;
};

struct vfs_io_uring_request {
        struct vfs_io_uring_request *prev, *next;
        struct vfs_io_uring_request **list_head;
        struct vfs_io_uring_config *config;
        struct tevent_req *req;
        struct io_uring_sqe sqe;
        struct io_uring_cqe cqe;
        void (*completion_fn)(struct vfs_io_uring_request *cur,
                              const char *location);
        struct timespec start_time;
        struct timespec end_time;
        SMBPROFILE_BYTES_ASYNC_STATE(profile_bytes);
};
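
/*
 * Lifecycle of a request: the _send() functions prepare the sqe and
 * append the request to config->queue.  vfs_io_uring_queue_run()
 * copies the sqe into the kernel submission queue and moves the
 * request to config->pending.  Once the matching cqe arrives (or the
 * ring is torn down with a synthetic error cqe),
 * vfs_io_uring_finish_req() unlinks the request and invokes its
 * completion_fn.
 */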

static void vfs_io_uring_finish_req(struct vfs_io_uring_request *cur,
                                    const struct io_uring_cqe *cqe,
                                    struct timespec end_time,
                                    const char *location)
{
        struct tevent_req *req =
                talloc_get_type_abort(cur->req,
                struct tevent_req);
        void *state = _tevent_req_data(req);

        talloc_set_destructor(state, NULL);
        if (cur->list_head != NULL) {
                DLIST_REMOVE((*cur->list_head), cur);
                cur->list_head = NULL;
        }
        cur->cqe = *cqe;

        SMBPROFILE_BYTES_ASYNC_SET_IDLE(cur->profile_bytes);
        cur->end_time = end_time;

        /*
         * We rely on being inside the _send() function
         * or tevent_req_defer_callback() being called
         * already.
         */
        cur->completion_fn(cur, location);
}

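/*
 * Tear down the ring and fail every request that is still queued or
 * pending.  A synthetic cqe carrying the error in .res is handed to
 * vfs_io_uring_finish_req(), so each request completes through the
 * normal path.
 */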
static void vfs_io_uring_config_destroy(struct vfs_io_uring_config *config,
                                        int ret,
                                        const char *location)
{
        struct vfs_io_uring_request *cur = NULL, *next = NULL;
        struct timespec start_time;
        struct timespec end_time;
        struct io_uring_cqe err_cqe = {
                .res = ret,
        };

        PROFILE_TIMESTAMP(&start_time);

        if (config->uring.ring_fd != -1) {
                /* TODO: cancel queued and pending requests */
                TALLOC_FREE(config->fde);
                io_uring_queue_exit(&config->uring);
                config->uring.ring_fd = -1;
        }

        PROFILE_TIMESTAMP(&end_time);

        for (cur = config->pending; cur != NULL; cur = next) {
                next = cur->next;
                err_cqe.user_data = (uintptr_t)(void *)cur;
                vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
        }

        for (cur = config->queue; cur != NULL; cur = next) {
                next = cur->next;
                err_cqe.user_data = (uintptr_t)(void *)cur;
                cur->start_time = start_time;
                vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
        }
}

static int vfs_io_uring_config_destructor(struct vfs_io_uring_config *config)
{
        vfs_io_uring_config_destroy(config, -EUCLEAN, __location__);
        return 0;
}

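/*
 * Destructor for a request's tevent_req state while the request is in
 * flight in the kernel.  The sqe references memory inside that state
 * (e.g. the iovec), so once the state goes away the only safe option
 * is to shut down the whole ring.
 */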
static int vfs_io_uring_request_state_deny_destructor(void *_state)
{
        struct __vfs_io_uring_generic_state {
                struct vfs_io_uring_request ur;
        } *state = (struct __vfs_io_uring_generic_state *)_state;
        struct vfs_io_uring_request *cur = &state->ur;

        /* our parent is gone */
        cur->req = NULL;

        /* remove ourselves from any list */
        DLIST_REMOVE((*cur->list_head), cur);
        cur->list_head = NULL;

        /*
         * Our state is about to go away,
         * so all we can do is shut down the whole uring.
         * That's ok, as we're most likely called from exit_server().
         */
        vfs_io_uring_config_destroy(cur->config, -ESHUTDOWN, __location__);
        return 0;
}

static void vfs_io_uring_fd_handler(struct tevent_context *ev,
                                    struct tevent_fd *fde,
                                    uint16_t flags,
                                    void *private_data);

static int vfs_io_uring_connect(vfs_handle_struct *handle, const char *service,
                            const char *user)
{
        int ret;
        struct vfs_io_uring_config *config;
        unsigned num_entries;
        bool sqpoll;
        unsigned flags = 0;

        config = talloc_zero(handle->conn, struct vfs_io_uring_config);
        if (config == NULL) {
                DEBUG(0, ("talloc_zero() failed\n"));
                return -1;
        }

        SMB_VFS_HANDLE_SET_DATA(handle, config,
                                NULL, struct vfs_io_uring_config,
                                return -1);

        ret = SMB_VFS_NEXT_CONNECT(handle, service, user);
        if (ret < 0) {
                return ret;
        }

        num_entries = lp_parm_ulong(SNUM(handle->conn),
                                    "io_uring",
                                    "num_entries",
                                    128);
        num_entries = MAX(num_entries, 1);

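        /*
         * IORING_SETUP_SQPOLL asks the kernel to run a submission
         * queue polling thread, so that submissions usually avoid a
         * system call.  On the kernels this module targets, SQPOLL
         * typically requires CAP_SYS_ADMIN, so it is off by default.
         */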
        sqpoll = lp_parm_bool(SNUM(handle->conn),
                             "io_uring",
                             "sqpoll",
                             false);
        if (sqpoll) {
                flags |= IORING_SETUP_SQPOLL;
        }

        ret = io_uring_queue_init(num_entries, &config->uring, flags);
        if (ret < 0) {
                SMB_VFS_NEXT_DISCONNECT(handle);
                errno = -ret;
                return -1;
        }

        talloc_set_destructor(config, vfs_io_uring_config_destructor);

#ifdef HAVE_IO_URING_RING_DONTFORK
        ret = io_uring_ring_dontfork(&config->uring);
        if (ret < 0) {
                SMB_VFS_NEXT_DISCONNECT(handle);
                errno = -ret;
                return -1;
        }
#endif /* HAVE_IO_URING_RING_DONTFORK */

        config->fde = tevent_add_fd(handle->conn->sconn->ev_ctx,
                                    config,
                                    config->uring.ring_fd,
                                    TEVENT_FD_READ,
                                    vfs_io_uring_fd_handler,
                                    handle);
        if (config->fde == NULL) {
                ret = errno;
                SMB_VFS_NEXT_DISCONNECT(handle);
                errno = ret;
                return -1;
        }

        return 0;
}

static void vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
{
        struct vfs_io_uring_request *cur = NULL, *next = NULL;
        struct io_uring_cqe *cqe = NULL;
        unsigned cqhead;
        unsigned nr = 0;
        struct timespec start_time;
        struct timespec end_time;
        int ret;

        PROFILE_TIMESTAMP(&start_time);

        if (config->uring.ring_fd == -1) {
                vfs_io_uring_config_destroy(config, -ESTALE, __location__);
                return;
        }

        for (cur = config->queue; cur != NULL; cur = next) {
                struct io_uring_sqe *sqe = NULL;
                void *state = _tevent_req_data(cur->req);

                next = cur->next;

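                /*
                 * io_uring_get_sqe() returns NULL when the submission
                 * queue is full.  Leave the remaining requests on
                 * config->queue; they will be picked up on a later
                 * run, once earlier submissions have been consumed.
                 */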
                sqe = io_uring_get_sqe(&config->uring);
                if (sqe == NULL) {
                        break;
                }

                talloc_set_destructor(state,
                        vfs_io_uring_request_state_deny_destructor);
                DLIST_REMOVE(config->queue, cur);
                *sqe = cur->sqe;
                DLIST_ADD_END(config->pending, cur);
                cur->list_head = &config->pending;
                SMBPROFILE_BYTES_ASYNC_SET_BUSY(cur->profile_bytes);

                cur->start_time = start_time;
        }

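        /*
         * Flush the prepared sqes to the kernel.  -EAGAIN and -EBUSY
         * are treated as transient (e.g. too many unreaped
         * completions); the requests stay on config->pending and the
         * submission is retried on a later run.
         */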
        ret = io_uring_submit(&config->uring);
        if (ret == -EAGAIN || ret == -EBUSY) {
                /* We just retry later */
        } else if (ret < 0) {
                vfs_io_uring_config_destroy(config, ret, __location__);
                return;
        }

        PROFILE_TIMESTAMP(&end_time);

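        /*
         * Reap every completion that is already visible in the
         * completion queue, without an extra system call.  The
         * user_data stashed via io_uring_sqe_set_data() points back
         * at our request; io_uring_cq_advance() then marks all
         * consumed cqes in one batch.
         */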
        io_uring_for_each_cqe(&config->uring, cqhead, cqe) {
                cur = (struct vfs_io_uring_request *)io_uring_cqe_get_data(cqe);
                vfs_io_uring_finish_req(cur, cqe, end_time, __location__);
                nr++;
        }

        io_uring_cq_advance(&config->uring, nr);
}

static void vfs_io_uring_fd_handler(struct tevent_context *ev,
                                    struct tevent_fd *fde,
                                    uint16_t flags,
                                    void *private_data)
{
        vfs_handle_struct *handle = (vfs_handle_struct *)private_data;
        struct vfs_io_uring_config *config = NULL;

        SMB_VFS_HANDLE_GET_DATA(handle, config,
                                struct vfs_io_uring_config,
                                smb_panic(__location__));

        vfs_io_uring_queue_run(config);
}

struct vfs_io_uring_pread_state {
        struct vfs_io_uring_request ur;
        struct iovec iov;
        size_t nread;
};

static void vfs_io_uring_pread_completion(struct vfs_io_uring_request *cur,
                                          const char *location);

static struct tevent_req *vfs_io_uring_pread_send(struct vfs_handle_struct *handle,
                                             TALLOC_CTX *mem_ctx,
                                             struct tevent_context *ev,
                                             struct files_struct *fsp,
                                             void *data,
                                             size_t n, off_t offset)
{
        struct tevent_req *req = NULL;
        struct vfs_io_uring_pread_state *state = NULL;
        struct vfs_io_uring_config *config = NULL;

        SMB_VFS_HANDLE_GET_DATA(handle, config,
                                struct vfs_io_uring_config,
                                smb_panic(__location__));

        req = tevent_req_create(mem_ctx, &state,
                                struct vfs_io_uring_pread_state);
        if (req == NULL) {
                return NULL;
        }
        state->ur.config = config;
        state->ur.req = req;
        state->ur.completion_fn = vfs_io_uring_pread_completion;

        SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pread, profile_p,
                                     state->ur.profile_bytes, n);
        SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);

        state->iov.iov_base = (void *)data;
        state->iov.iov_len = n;
        io_uring_prep_readv(&state->ur.sqe,
                            fsp->fh->fd,
                            &state->iov, 1,
                            offset);
        io_uring_sqe_set_data(&state->ur.sqe, &state->ur);
        DLIST_ADD_END(config->queue, &state->ur);
        state->ur.list_head = &config->queue;

        vfs_io_uring_queue_run(config);

        if (!tevent_req_is_in_progress(req)) {
                return tevent_req_post(req, ev);
        }

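        /*
         * The request may also complete from a later
         * vfs_io_uring_fd_handler() run, i.e. outside the caller's
         * stack frame.  Deferring the callback lets the completion
         * function call tevent_req_done()/_error() directly in both
         * the synchronous and the asynchronous case.
         */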
        tevent_req_defer_callback(req, ev);
        return req;
}

static void vfs_io_uring_pread_completion(struct vfs_io_uring_request *cur,
                                          const char *location)
{
        struct vfs_io_uring_pread_state *state = tevent_req_data(
                cur->req, struct vfs_io_uring_pread_state);

        /*
         * We rely on being inside the _send() function
         * or tevent_req_defer_callback() being called
         * already.
         */

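        /*
         * As in the raw io_uring interface, cqe.res carries a
         * negative errno on failure and the number of bytes read on
         * success.
         */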
        if (cur->cqe.res < 0) {
                int err = -cur->cqe.res;
                _tevent_req_error(cur->req, err, location);
                return;
        }

        state->nread = state->ur.cqe.res;
        tevent_req_done(cur->req);
}

static ssize_t vfs_io_uring_pread_recv(struct tevent_req *req,
                                  struct vfs_aio_state *vfs_aio_state)
{
        struct vfs_io_uring_pread_state *state = tevent_req_data(
                req, struct vfs_io_uring_pread_state);
        ssize_t ret;

        SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
        vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
                                                 &state->ur.start_time);

        if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
                tevent_req_received(req);
                return -1;
        }

        vfs_aio_state->error = 0;
        ret = state->nread;

        tevent_req_received(req);
        return ret;
}

struct vfs_io_uring_pwrite_state {
        struct vfs_io_uring_request ur;
        struct iovec iov;
        size_t nwritten;
};

static void vfs_io_uring_pwrite_completion(struct vfs_io_uring_request *cur,
                                           const char *location);

static struct tevent_req *vfs_io_uring_pwrite_send(struct vfs_handle_struct *handle,
                                              TALLOC_CTX *mem_ctx,
                                              struct tevent_context *ev,
                                              struct files_struct *fsp,
                                              const void *data,
                                              size_t n, off_t offset)
{
        struct tevent_req *req = NULL;
        struct vfs_io_uring_pwrite_state *state = NULL;
        struct vfs_io_uring_config *config = NULL;

        SMB_VFS_HANDLE_GET_DATA(handle, config,
                                struct vfs_io_uring_config,
                                smb_panic(__location__));

        req = tevent_req_create(mem_ctx, &state,
                                struct vfs_io_uring_pwrite_state);
        if (req == NULL) {
                return NULL;
        }
        state->ur.config = config;
        state->ur.req = req;
        state->ur.completion_fn = vfs_io_uring_pwrite_completion;

        SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pwrite, profile_p,
                                     state->ur.profile_bytes, n);
        SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);

        state->iov.iov_base = discard_const(data);
        state->iov.iov_len = n;
        io_uring_prep_writev(&state->ur.sqe,
                             fsp->fh->fd,
                             &state->iov, 1,
                             offset);
        io_uring_sqe_set_data(&state->ur.sqe, &state->ur);
        DLIST_ADD_END(config->queue, &state->ur);
        state->ur.list_head = &config->queue;

        vfs_io_uring_queue_run(config);

        if (!tevent_req_is_in_progress(req)) {
                return tevent_req_post(req, ev);
        }

        tevent_req_defer_callback(req, ev);
        return req;
}

static void vfs_io_uring_pwrite_completion(struct vfs_io_uring_request *cur,
                                           const char *location)
{
        struct vfs_io_uring_pwrite_state *state = tevent_req_data(
                cur->req, struct vfs_io_uring_pwrite_state);

        /*
         * We rely on being inside the _send() function
         * or tevent_req_defer_callback() being called
         * already.
         */

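        /*
         * cqe.res is a negative errno on failure; on success it is
         * the number of bytes written which, as with pwrite(2), may
         * be a short write that the caller has to handle.
         */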
        if (cur->cqe.res < 0) {
                int err = -cur->cqe.res;
                _tevent_req_error(cur->req, err, location);
                return;
        }

        state->nwritten = state->ur.cqe.res;
        tevent_req_done(cur->req);
}

static ssize_t vfs_io_uring_pwrite_recv(struct tevent_req *req,
                                   struct vfs_aio_state *vfs_aio_state)
{
        struct vfs_io_uring_pwrite_state *state = tevent_req_data(
                req, struct vfs_io_uring_pwrite_state);
        ssize_t ret;

        SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
        vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
                                                 &state->ur.start_time);

        if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
                tevent_req_received(req);
                return -1;
        }

        vfs_aio_state->error = 0;
        ret = state->nwritten;

        tevent_req_received(req);
        return ret;
}

struct vfs_io_uring_fsync_state {
        struct vfs_io_uring_request ur;
};

static void vfs_io_uring_fsync_completion(struct vfs_io_uring_request *cur,
                                          const char *location);

static struct tevent_req *vfs_io_uring_fsync_send(struct vfs_handle_struct *handle,
                                             TALLOC_CTX *mem_ctx,
                                             struct tevent_context *ev,
                                             struct files_struct *fsp)
{
        struct tevent_req *req = NULL;
        struct vfs_io_uring_fsync_state *state = NULL;
        struct vfs_io_uring_config *config = NULL;

        SMB_VFS_HANDLE_GET_DATA(handle, config,
                                struct vfs_io_uring_config,
                                smb_panic(__location__));

        req = tevent_req_create(mem_ctx, &state,
                                struct vfs_io_uring_fsync_state);
        if (req == NULL) {
                return NULL;
        }
        state->ur.config = config;
        state->ur.req = req;
        state->ur.completion_fn = vfs_io_uring_fsync_completion;

        SMBPROFILE_BYTES_ASYNC_START(syscall_asys_fsync, profile_p,
                                     state->ur.profile_bytes, 0);
        SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);

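        /*
         * fsync_flags == 0 requests a full fsync(2); passing
         * IORING_FSYNC_DATASYNC instead would give fdatasync(2)
         * semantics.
         */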
        io_uring_prep_fsync(&state->ur.sqe,
                            fsp->fh->fd,
                            0); /* fsync_flags */
        io_uring_sqe_set_data(&state->ur.sqe, &state->ur);
        DLIST_ADD_END(config->queue, &state->ur);
        state->ur.list_head = &config->queue;

        vfs_io_uring_queue_run(config);

        if (!tevent_req_is_in_progress(req)) {
                return tevent_req_post(req, ev);
        }

        tevent_req_defer_callback(req, ev);
        return req;
}

static void vfs_io_uring_fsync_completion(struct vfs_io_uring_request *cur,
                                          const char *location)
{
        /*
         * We rely on being inside the _send() function
         * or tevent_req_defer_callback() being called
         * already.
         */
        _tevent_req_done(cur->req, location);
}

static int vfs_io_uring_fsync_recv(struct tevent_req *req,
                              struct vfs_aio_state *vfs_aio_state)
{
        struct vfs_io_uring_fsync_state *state = tevent_req_data(
                req, struct vfs_io_uring_fsync_state);
        int ret;

        SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
        vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
                                                 &state->ur.start_time);

        if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
                tevent_req_received(req);
                return -1;
        }

        if (state->ur.cqe.res < 0) {
                vfs_aio_state->error = -state->ur.cqe.res;
                ret = -1;
        } else {
                vfs_aio_state->error = 0;
                ret = state->ur.cqe.res;
        }

        tevent_req_received(req);
        return ret;
}

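/*
 * Only the connect hook and the asynchronous pread/pwrite/fsync hooks
 * are overridden; every other VFS operation falls through to the next
 * module in the stack.
 */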
static struct vfs_fn_pointers vfs_io_uring_fns = {
        .connect_fn = vfs_io_uring_connect,
        .pread_send_fn = vfs_io_uring_pread_send,
        .pread_recv_fn = vfs_io_uring_pread_recv,
        .pwrite_send_fn = vfs_io_uring_pwrite_send,
        .pwrite_recv_fn = vfs_io_uring_pwrite_recv,
        .fsync_send_fn = vfs_io_uring_fsync_send,
        .fsync_recv_fn = vfs_io_uring_fsync_recv,
};

static_decl_vfs;
NTSTATUS vfs_io_uring_init(TALLOC_CTX *ctx)
{
        return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
                                "io_uring", &vfs_io_uring_fns);
}
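
/*
 * For illustration, a minimal smb.conf share section enabling the
 * module.  The parametric option names follow from the lp_parm_*()
 * calls in vfs_io_uring_connect() above; the share name and path are
 * hypothetical:
 *
 *      [data]
 *              path = /srv/data
 *              vfs objects = io_uring
 *              io_uring:num_entries = 128
 *              io_uring:sqpoll = no
 */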