/*
   Unix SMB/CIFS implementation.

   main select loop and event handling - aio/epoll hybrid implementation

   Copyright (C) Andrew Tridgell 2006

   based on events_standard.c

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/*
  this is a very strange beast. The Linux AIO implementation doesn't
  yet integrate properly with epoll, but there is a kernel patch that
  allows the aio wait primitives to be used to wait for epoll events,
  and this can be used to give us a unified event system incorporating
  both aio events and epoll events

  this is _very_ experimental code
*/
33 #include "system/filesys.h"
34 #include "system/network.h"
36 #include "events_internal.h"
37 #include <sys/epoll.h>
/* maximum number of iocbs/epoll events we queue in the kernel at once */
#define MAX_AIO_QUEUE_DEPTH	100

/* opcode for the experimental aio->epoll wait kernel patch; supply it
   ourselves when the installed kernel headers don't define it */
#ifndef IOCB_CMD_EPOLL_WAIT
#define IOCB_CMD_EPOLL_WAIT	9
#endif
45 struct aio_event_context {
46 /* a pointer back to the generic event_context */
47 struct event_context *ev;
49 /* list of filedescriptor events */
50 struct fd_event *fd_events;
52 /* number of registered fd event handlers */
55 uint32_t destruction_count;
59 struct epoll_event epevent[MAX_AIO_QUEUE_DEPTH];
61 struct iocb *epoll_iocb;
69 struct event_context *event_ctx;
72 event_aio_handler_t handler;
76 map from EVENT_FD_* to EPOLLIN/EPOLLOUT
78 static uint32_t epoll_map_flags(uint16_t flags)
81 if (flags & EVENT_FD_READ) ret |= (EPOLLIN | EPOLLERR | EPOLLHUP);
82 if (flags & EVENT_FD_WRITE) ret |= (EPOLLOUT | EPOLLERR | EPOLLHUP);
89 static int aio_ctx_destructor(struct aio_event_context *aio_ev)
91 io_queue_release(aio_ev->ioctx);
92 close(aio_ev->epoll_fd);
93 aio_ev->epoll_fd = -1;
97 static void epoll_add_event(struct aio_event_context *aio_ev, struct fd_event *fde);
100 reopen the epoll handle when our pid changes
101 see http://junkcode.samba.org/ftp/unpacked/junkcode/epoll_fork.c for an
102 demonstration of why this is needed
104 static void epoll_check_reopen(struct aio_event_context *aio_ev)
106 struct fd_event *fde;
108 if (aio_ev->pid == getpid()) {
112 close(aio_ev->epoll_fd);
113 aio_ev->epoll_fd = epoll_create(MAX_AIO_QUEUE_DEPTH);
114 if (aio_ev->epoll_fd == -1) {
115 ev_debug(aio_ev->ev, EV_DEBUG_FATAL, "Failed to recreate epoll handle after fork\n");
118 aio_ev->pid = getpid();
119 for (fde=aio_ev->fd_events;fde;fde=fde->next) {
120 epoll_add_event(aio_ev, fde);
/* this fd_event currently has an event registered with epoll */
#define EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT	(1<<0)
/* EPOLLHUP/EPOLLERR should be reported to the handler (as EVENT_FD_READ) */
#define EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR	(1<<1)
/* an EPOLLHUP/EPOLLERR has been seen on this fd */
#define EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR	(1<<2)
129 add the epoll event to the given fd_event
131 static void epoll_add_event(struct aio_event_context *aio_ev, struct fd_event *fde)
133 struct epoll_event event;
134 if (aio_ev->epoll_fd == -1) return;
136 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
138 /* if we don't want events yet, don't add an aio_event */
139 if (fde->flags == 0) return;
142 event.events = epoll_map_flags(fde->flags);
143 event.data.ptr = fde;
144 epoll_ctl(aio_ev->epoll_fd, EPOLL_CTL_ADD, fde->fd, &event);
145 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
147 /* only if we want to read we want to tell the event handler about errors */
148 if (fde->flags & EVENT_FD_READ) {
149 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
154 delete the epoll event for given fd_event
156 static void epoll_del_event(struct aio_event_context *aio_ev, struct fd_event *fde)
158 struct epoll_event event;
160 DLIST_REMOVE(aio_ev->fd_events, fde);
162 if (aio_ev->epoll_fd == -1) return;
164 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
166 /* if there's no aio_event, we don't need to delete it */
167 if (!(fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT)) return;
170 event.events = epoll_map_flags(fde->flags);
171 event.data.ptr = fde;
172 epoll_ctl(aio_ev->epoll_fd, EPOLL_CTL_DEL, fde->fd, &event);
174 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
178 change the epoll event to the given fd_event
180 static void epoll_mod_event(struct aio_event_context *aio_ev, struct fd_event *fde)
182 struct epoll_event event;
183 if (aio_ev->epoll_fd == -1) return;
185 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
188 event.events = epoll_map_flags(fde->flags);
189 event.data.ptr = fde;
190 epoll_ctl(aio_ev->epoll_fd, EPOLL_CTL_MOD, fde->fd, &event);
192 /* only if we want to read we want to tell the event handler about errors */
193 if (fde->flags & EVENT_FD_READ) {
194 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
198 static void epoll_change_event(struct aio_event_context *aio_ev, struct fd_event *fde)
200 bool got_error = (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR);
201 bool want_read = (fde->flags & EVENT_FD_READ);
202 bool want_write= (fde->flags & EVENT_FD_WRITE);
204 if (aio_ev->epoll_fd == -1) return;
206 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
208 /* there's already an event */
209 if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT) {
210 if (want_read || (want_write && !got_error)) {
211 epoll_mod_event(aio_ev, fde);
214 epoll_del_event(aio_ev, fde);
218 /* there's no aio_event attached to the fde */
219 if (want_read || (want_write && !got_error)) {
220 DLIST_ADD(aio_ev->fd_events, fde);
221 epoll_add_event(aio_ev, fde);
226 static int setup_epoll_wait(struct aio_event_context *aio_ev)
228 if (aio_ev->is_epoll_set) {
231 memset(aio_ev->epoll_iocb, 0, sizeof(*aio_ev->epoll_iocb));
232 aio_ev->epoll_iocb->aio_fildes = aio_ev->epoll_fd;
233 aio_ev->epoll_iocb->aio_lio_opcode = IOCB_CMD_EPOLL_WAIT;
234 aio_ev->epoll_iocb->aio_reqprio = 0;
236 aio_ev->epoll_iocb->u.c.nbytes = MAX_AIO_QUEUE_DEPTH;
237 aio_ev->epoll_iocb->u.c.offset = -1;
238 aio_ev->epoll_iocb->u.c.buf = aio_ev->epevent;
240 if (io_submit(aio_ev->ioctx, 1, &aio_ev->epoll_iocb) != 1) {
243 aio_ev->is_epoll_set = 1;
250 event loop handling using aio/epoll hybrid
252 static int aio_event_loop(struct aio_event_context *aio_ev, struct timeval *tvalp)
255 uint32_t destruction_count = ++aio_ev->destruction_count;
256 struct timespec timeout;
257 struct io_event events[8];
259 if (aio_ev->epoll_fd == -1) return -1;
261 if (aio_ev->ev->num_signal_handlers &&
262 common_event_check_signal(aio_ev->ev)) {
267 timeout.tv_sec = tvalp->tv_sec;
268 timeout.tv_nsec = tvalp->tv_usec;
269 timeout.tv_nsec *= 1000;
272 if (setup_epoll_wait(aio_ev) < 0)
275 ret = io_getevents(aio_ev->ioctx, 1, 8,
276 events, tvalp?&timeout:NULL);
279 if (aio_ev->ev->num_signal_handlers) {
280 common_event_check_signal(aio_ev->ev);
285 if (ret == 0 && tvalp) {
286 /* we don't care about a possible delay here */
287 common_event_loop_timer_delay(aio_ev->ev);
291 for (i=0;i<ret;i++) {
292 struct io_event *event = &events[i];
293 struct iocb *finished = event->obj;
295 switch (finished->aio_lio_opcode) {
298 struct aio_event *ae = talloc_get_type(finished->data,
301 talloc_set_destructor(ae, NULL);
302 ae->handler(ae->event_ctx, ae,
303 event->res, ae->private_data);
308 case IOCB_CMD_EPOLL_WAIT: {
309 struct epoll_event *ep = (struct epoll_event *)finished->u.c.buf;
310 struct fd_event *fde;
314 aio_ev->is_epoll_set = 0;
316 for (j=0; j<event->res; j++, ep++) {
317 fde = talloc_get_type(ep->data.ptr,
322 if (ep->events & (EPOLLHUP|EPOLLERR)) {
323 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR;
324 if (!(fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR)) {
325 epoll_del_event(aio_ev, fde);
328 flags |= EVENT_FD_READ;
330 if (ep->events & EPOLLIN) flags |= EVENT_FD_READ;
331 if (ep->events & EPOLLOUT) flags |= EVENT_FD_WRITE;
333 fde->handler(aio_ev->ev, fde, flags, fde->private_data);
339 if (destruction_count != aio_ev->destruction_count) {
348 create a aio_event_context structure.
350 static int aio_event_context_init(struct event_context *ev)
352 struct aio_event_context *aio_ev;
354 aio_ev = talloc_zero(ev, struct aio_event_context);
355 if (!aio_ev) return -1;
358 aio_ev->epoll_iocb = talloc(aio_ev, struct iocb);
360 if (io_queue_init(MAX_AIO_QUEUE_DEPTH, &aio_ev->ioctx) != 0) {
365 aio_ev->epoll_fd = epoll_create(MAX_AIO_QUEUE_DEPTH);
366 if (aio_ev->epoll_fd == -1) {
370 aio_ev->pid = getpid();
372 talloc_set_destructor(aio_ev, aio_ctx_destructor);
374 ev->additional_data = aio_ev;
376 if (setup_epoll_wait(aio_ev) < 0) {
387 static int aio_event_fd_destructor(struct fd_event *fde)
389 struct event_context *ev = fde->event_ctx;
390 struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
391 struct aio_event_context);
393 epoll_check_reopen(aio_ev);
395 aio_ev->num_fd_events--;
396 aio_ev->destruction_count++;
398 epoll_del_event(aio_ev, fde);
400 if (fde->flags & EVENT_FD_AUTOCLOSE) {
410 return NULL on failure (memory allocation error)
412 static struct fd_event *aio_event_add_fd(struct event_context *ev, TALLOC_CTX *mem_ctx,
413 int fd, uint16_t flags,
414 event_fd_handler_t handler,
417 struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
418 struct aio_event_context);
419 struct fd_event *fde;
421 epoll_check_reopen(aio_ev);
423 fde = talloc(mem_ctx?mem_ctx:ev, struct fd_event);
424 if (!fde) return NULL;
429 fde->handler = handler;
430 fde->private_data = private_data;
431 fde->additional_flags = 0;
432 fde->additional_data = NULL;
434 aio_ev->num_fd_events++;
435 talloc_set_destructor(fde, aio_event_fd_destructor);
437 DLIST_ADD(aio_ev->fd_events, fde);
438 epoll_add_event(aio_ev, fde);
445 return the fd event flags
447 static uint16_t aio_event_get_fd_flags(struct fd_event *fde)
453 set the fd event flags
455 static void aio_event_set_fd_flags(struct fd_event *fde, uint16_t flags)
457 struct event_context *ev;
458 struct aio_event_context *aio_ev;
460 if (fde->flags == flags) return;
463 aio_ev = talloc_get_type(ev->additional_data, struct aio_event_context);
467 epoll_check_reopen(aio_ev);
469 epoll_change_event(aio_ev, fde);
473 do a single event loop using the events defined in ev
475 static int aio_event_loop_once(struct event_context *ev)
477 struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
478 struct aio_event_context);
481 tval = common_event_loop_timer_delay(ev);
482 if (timeval_is_zero(&tval)) {
486 epoll_check_reopen(aio_ev);
488 return aio_event_loop(aio_ev, &tval);
492 return on failure or (with 0) if all fd events are removed
494 static int aio_event_loop_wait(struct event_context *ev)
496 struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
497 struct aio_event_context);
498 while (aio_ev->num_fd_events) {
499 if (aio_event_loop_once(ev) != 0) {
508 called when a disk IO event needs to be cancelled
510 static int aio_destructor(struct aio_event *ae)
512 struct event_context *ev = ae->event_ctx;
513 struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
514 struct aio_event_context);
515 struct io_event result;
516 io_cancel(aio_ev->ioctx, &ae->iocb, &result);
517 /* TODO: handle errors from io_cancel()! */
521 /* submit an aio disk IO event */
522 static struct aio_event *aio_event_add_aio(struct event_context *ev,
525 event_aio_handler_t handler,
528 struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
529 struct aio_event_context);
531 struct aio_event *ae = talloc(mem_ctx?mem_ctx:ev, struct aio_event);
532 if (ae == NULL) return NULL;
536 ae->handler = handler;
537 ae->private_data = private_data;
540 if (io_submit(aio_ev->ioctx, 1, &iocbp) != 1) {
545 talloc_set_destructor(ae, aio_destructor);
550 static const struct event_ops aio_event_ops = {
551 .context_init = aio_event_context_init,
552 .add_fd = aio_event_add_fd,
553 .add_aio = aio_event_add_aio,
554 .get_fd_flags = aio_event_get_fd_flags,
555 .set_fd_flags = aio_event_set_fd_flags,
556 .add_timed = common_event_add_timed,
557 .add_signal = common_event_add_signal,
558 .loop_once = aio_event_loop_once,
559 .loop_wait = aio_event_loop_wait,
562 bool events_aio_init(void)
564 return event_register_backend("aio", &aio_event_ops);