2 Unix SMB/CIFS implementation.
4 main select loop and event handling - epoll implementation
6 Copyright (C) Andrew Tridgell 2003-2005
7 Copyright (C) Stefan Metzmacher 2005-2013
8 Copyright (C) Jeremy Allison 2013
10 ** NOTE! The following LGPL license applies to the tevent
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
29 #include "system/filesys.h"
30 #include "system/select.h"
32 #include "tevent_internal.h"
33 #include "tevent_util.h"
/*
 * Backend-private state of the epoll tevent backend. An instance is
 * stored in tevent_context::additional_data by epoll_event_context_init().
 *
 * NOTE(review): the embedded line numbers jump (39 -> 44 -> 46), so this
 * listing omits some members. Other code in this file reads epoll_fd, pid
 * and panic_state through this struct; their declarations are not shown
 * here.
 */
35 struct epoll_event_context {
36 /* a pointer back to the generic event_context */
37 struct tevent_context *ev;
39 /* when using epoll this is the handle from epoll_create */
/*
 * Checked by epoll_panic(). epoll_event_loop_once() sets it to true
 * before its epoll_check_reopen() call and back to false afterwards.
 */
44 bool panic_force_replay;
/*
 * Optional callback invoked by epoll_panic() after this structure has been
 * freed. Installed with tevent_epoll_set_panic_fallback(). A false return
 * value is logged as fatal by epoll_panic().
 */
46 bool (*panic_fallback)(struct tevent_context *ev, bool replay);
/*
 * Bits this backend keeps in tevent_fd::additional_flags.
 *
 * HAS_EVENT    - set after a successful EPOLL_CTL_ADD/EPOLL_CTL_MOD for
 *                the fde, cleared on entry to the add/mod/del helpers.
 * REPORT_ERROR - set together with HAS_EVENT, but only when the fde asks
 *                for TEVENT_FD_READ; epoll_event_loop() checks it before
 *                turning EPOLLHUP/EPOLLERR into a read event.
 * GOT_ERROR    - set by epoll_event_loop() when epoll_wait() reported
 *                EPOLLHUP or EPOLLERR for the fde.
 * HAS_MPX      - set on both fde's that epoll_add_multiplex_fd() paired
 *                on the same file descriptor.
 */
49 #define EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT (1<<0)
50 #define EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR (1<<1)
51 #define EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR (1<<2)
52 #define EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX (1<<3)
/*
 * Everything from here to the matching #endif (not visible in this
 * listing) is only compiled when TEST_PANIC_FALLBACK is defined.
 */
54 #ifdef TEST_PANIC_FALLBACK
/*
 * Fault-injecting wrapper around epoll_create().
 *
 * Without an installed panic_fallback it behaves exactly like
 * epoll_create(size). With one installed, random() decides between an
 * injected failure and the real call.
 *
 * NOTE(review): the second parameter (used as "size") and the body of the
 * failure branch are elided from this listing.
 */
56 static int epoll_create_panic_fallback(struct epoll_event_context *epoll_ev,
/* no fallback handler installed: never inject failures */
59 if (epoll_ev->panic_fallback == NULL) {
60 return epoll_create(size);
63 /* 50% of the time, fail... */
64 if ((random() % 2) == 0) {
/* not selected for failure: perform the real call */
69 return epoll_create(size);
/*
 * Fault-injecting wrapper around epoll_ctl(), used only in
 * TEST_PANIC_FALLBACK builds.
 *
 * Without an installed panic_fallback the arguments are passed straight
 * to epoll_ctl(). With one installed, random() decides between an
 * injected failure and the real call.
 *
 * NOTE(review): the body of the failure branch is elided from this
 * listing.
 */
72 static int epoll_ctl_panic_fallback(struct epoll_event_context *epoll_ev,
73 int epfd, int op, int fd,
74 struct epoll_event *event)
/* no fallback handler installed: never inject failures */
76 if (epoll_ev->panic_fallback == NULL) {
77 return epoll_ctl(epfd, op, fd, event);
80 /* 50% of the time, fail... */
81 if ((random() % 2) == 0) {
/* not selected for failure: perform the real call */
86 return epoll_ctl(epfd, op, fd, event);
/*
 * Fault-injecting wrapper around epoll_wait(), used only in
 * TEST_PANIC_FALLBACK builds.
 *
 * Without an installed panic_fallback the arguments are passed straight
 * to epoll_wait(). With one installed, random() decides between an
 * injected failure and the real call.
 *
 * NOTE(review): parts of the parameter list (epfd, maxevents, timeout)
 * and the body of the failure branch are elided from this listing.
 */
89 static int epoll_wait_panic_fallback(struct epoll_event_context *epoll_ev,
91 struct epoll_event *events,
/* no fallback handler installed: never inject failures */
95 if (epoll_ev->panic_fallback == NULL) {
96 return epoll_wait(epfd, events, maxevents, timeout);
99 /* 50% of the time, fail... */
100 if ((random() % 2) == 0) {
/* not selected for failure: perform the real call */
105 return epoll_wait(epfd, events, maxevents, timeout);
/*
 * Redirect every later epoll_create()/epoll_ctl()/epoll_wait() call in
 * this file to the fault-injecting wrappers.
 *
 * These macros are defined after the wrappers, so the wrappers themselves
 * still reach the real functions. Each expansion references a variable
 * named epoll_ev, which therefore has to be in scope at every call site.
 */
108 #define epoll_create(_size) \
109 epoll_create_panic_fallback(epoll_ev, _size)
110 #define epoll_ctl(_epfd, _op, _fd, _event) \
111 epoll_ctl_panic_fallback(epoll_ev,_epfd, _op, _fd, _event)
112 #define epoll_wait(_epfd, _events, _maxevents, _timeout) \
113 epoll_wait_panic_fallback(epoll_ev, _epfd, _events, _maxevents, _timeout)
/*
 * Install (or replace) the callback that epoll_panic() runs when an epoll
 * call fails.
 *
 * ev             - event context whose additional_data is expected to be
 *                  a struct epoll_event_context.
 * panic_fallback - callback stored in the backend state.
 *
 * NOTE(review): the return statements are elided from this listing;
 * presumably false is returned from the two NULL checks and true after
 * the assignment - confirm against the full source.
 */
117 called to set the panic fallback function.
119 _PRIVATE_ bool tevent_epoll_set_panic_fallback(struct tevent_context *ev,
120 bool (*panic_fallback)(struct tevent_context *ev,
123 struct epoll_event_context *epoll_ev;
/* no backend state attached to this context */
125 if (ev->additional_data == NULL) {
/* type-checked cast: yields NULL when additional_data is of another talloc type */
129 epoll_ev = talloc_get_type(ev->additional_data,
130 struct epoll_event_context);
131 if (epoll_ev == NULL) {
134 epoll_ev->panic_fallback = panic_fallback;
/*
 * Central failure path for the backend.
 *
 * epoll_ev - backend state; it is freed inside this function (it is the
 *            object stored in ev->additional_data), so callers must not
 *            touch it afterwards.
 * reason   - text placed at the front of the debug messages.
 * replay   - passed on to the fallback callback.
 *
 * NOTE(review): the statements following the two "calling abort()"
 * messages and the body of the panic_force_replay branch are elided from
 * this listing.
 */
139 called when a epoll call fails
141 static void epoll_panic(struct epoll_event_context *epoll_ev,
142 const char *reason, bool replay)
144 struct tevent_context *ev = epoll_ev->ev;
145 bool (*panic_fallback)(struct tevent_context *ev, bool replay);
/* copy the callback out now: epoll_ev is freed further down */
147 panic_fallback = epoll_ev->panic_fallback;
/* let the caller that registered a panic_state flag know we panicked */
149 if (epoll_ev->panic_state != NULL) {
150 *epoll_ev->panic_state = true;
153 if (epoll_ev->panic_force_replay) {
/*
 * Frees the backend state and runs its destructor, which closes the
 * epoll handle.
 * NOTE(review): errno is read by strerror() only after this point; the
 * close() in the destructor may have changed it - confirm.
 */
157 TALLOC_FREE(ev->additional_data);
/* nobody to hand over to */
159 if (panic_fallback == NULL) {
160 tevent_debug(ev, TEVENT_DEBUG_FATAL,
161 "%s (%s) replay[%u] - calling abort()\n",
162 reason, strerror(errno), (unsigned)replay);
166 tevent_debug(ev, TEVENT_DEBUG_WARNING,
167 "%s (%s) replay[%u] - calling panic_fallback\n",
168 reason, strerror(errno), (unsigned)replay);
170 if (!panic_fallback(ev, replay)) {
171 /* Fallback failed. */
172 tevent_debug(ev, TEVENT_DEBUG_FATAL,
173 "%s (%s) replay[%u] - calling abort()\n",
174 reason, strerror(errno), (unsigned)replay);
/*
 * Translate a TEVENT_FD_* mask into an epoll event mask.
 *
 * TEVENT_FD_READ  -> EPOLLIN  | EPOLLERR | EPOLLHUP
 * TEVENT_FD_WRITE -> EPOLLOUT | EPOLLERR | EPOLLHUP
 *
 * NOTE(review): the declaration of "ret" and the return statement are
 * elided from this listing.
 */
180 map from TEVENT_FD_* to EPOLLIN/EPOLLOUT
182 static uint32_t epoll_map_flags(uint16_t flags)
185 if (flags & TEVENT_FD_READ) ret |= (EPOLLIN | EPOLLERR | EPOLLHUP);
186 if (flags & TEVENT_FD_WRITE) ret |= (EPOLLOUT | EPOLLERR | EPOLLHUP);
/*
 * talloc destructor of the backend state (installed by epoll_init_ctx()):
 * closes the epoll handle and marks it invalid.
 *
 * NOTE(review): the return statement is elided from this listing.
 */
193 static int epoll_ctx_destructor(struct epoll_event_context *epoll_ev)
195 close(epoll_ev->epoll_fd);
/* -1 marks the handle as closed */
196 epoll_ev->epoll_fd = -1;
/*
 * Create the epoll handle for a freshly allocated backend state.
 *
 * Creates the handle, tries to mark it close-on-exec (failure is only a
 * warning), records the current pid and installs epoll_ctx_destructor().
 *
 * NOTE(review): the return statements are elided from this listing;
 * epoll_event_context_init() inspects the result.
 */
203 static int epoll_init_ctx(struct epoll_event_context *epoll_ev)
/* 64 is the size argument given to epoll_create() */
205 epoll_ev->epoll_fd = epoll_create(64);
206 if (epoll_ev->epoll_fd == -1) {
207 tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL,
208 "Failed to create epoll handle.\n");
212 if (!ev_set_close_on_exec(epoll_ev->epoll_fd)) {
213 tevent_debug(epoll_ev->ev, TEVENT_DEBUG_WARNING,
214 "Failed to set close-on-exec, file descriptor may be leaked to children.\n");
/* remembered so epoll_check_reopen() can detect a pid change */
217 epoll_ev->pid = getpid();
218 talloc_set_destructor(epoll_ev, epoll_ctx_destructor);
/* forward declaration: epoll_check_reopen() below calls this before its definition */
223 static void epoll_update_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde);
/*
 * Recreate the epoll handle when getpid() no longer matches the pid that
 * was recorded when the handle was created, then register every fde of
 * the context with the new handle.
 *
 * A panic raised while re-registering is recorded in a local flag and
 * copied into the panic_state flag the caller had installed, if any.
 *
 * NOTE(review): the early return for the unchanged-pid case and the
 * statements after the panic checks are elided from this listing.
 */
226 reopen the epoll handle when our pid changes
227 see http://junkcode.samba.org/ftp/unpacked/junkcode/epoll_fork.c for an
228 demonstration of why this is needed
230 static void epoll_check_reopen(struct epoll_event_context *epoll_ev)
232 struct tevent_fd *fde;
/* remember the caller's flag; panic_state is redirected further down */
233 bool *caller_panic_state = epoll_ev->panic_state;
234 bool panic_triggered = false;
/* same process as when the handle was created */
236 if (epoll_ev->pid == getpid()) {
240 close(epoll_ev->epoll_fd);
241 epoll_ev->epoll_fd = epoll_create(64);
242 if (epoll_ev->epoll_fd == -1) {
243 epoll_panic(epoll_ev, "epoll_create() failed", false);
247 if (!ev_set_close_on_exec(epoll_ev->epoll_fd)) {
248 tevent_debug(epoll_ev->ev, TEVENT_DEBUG_WARNING,
249 "Failed to set close-on-exec, file descriptor may be leaked to children.\n");
252 epoll_ev->pid = getpid();
/* catch panics raised by epoll_update_event() in the loop below */
253 epoll_ev->panic_state = &panic_triggered;
254 for (fde=epoll_ev->ev->fd_events;fde;fde=fde->next) {
/* the new handle knows nothing about this fde yet, so force an add */
255 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
256 epoll_update_event(epoll_ev, fde);
258 if (panic_triggered) {
/* epoll_ev has been freed by epoll_panic(); only the caller's flag may be touched */
259 if (caller_panic_state != NULL) {
260 *caller_panic_state = true;
/*
 * NOTE(review): this resets panic_state to NULL instead of restoring
 * caller_panic_state. epoll_event_fd_destructor() relies on its flag
 * after calling this function - confirm that this is intended.
 */
265 epoll_ev->panic_state = NULL;
/*
 * Pair add_fde with an fde that is already registered for the same file
 * descriptor, and widen that existing epoll registration so it covers the
 * flags of both.
 *
 * epoll_ev - backend state.
 * add_fde  - the fde that could not get its own epoll registration.
 *
 * NOTE(review): the declaration of "ret", the loop-control statements
 * inside the search loop, and all return statements are elided from this
 * listing, so the return value contract is not visible here.
 */
269 epoll cannot add the same file descriptor twice, once
270 with read, once with write which is allowed by the
271 tevent backend. Multiplex the existing fde, flag it
272 as such so we can search for the correct fde on
276 static int epoll_add_multiplex_fd(struct epoll_event_context *epoll_ev,
277 struct tevent_fd *add_fde)
279 struct epoll_event event;
280 struct tevent_fd *mpx_fde;
283 /* Find the existing fde that caused the EEXIST error. */
284 for (mpx_fde = epoll_ev->ev->fd_events; mpx_fde; mpx_fde = mpx_fde->next) {
/* candidate must use the same file descriptor ... */
285 if (mpx_fde->fd != add_fde->fd) {
/* ... and must not be the fde we are trying to add */
289 if (mpx_fde == add_fde) {
295 if (mpx_fde == NULL) {
/* NOTE(review): this message has no trailing newline, unlike the others */
296 tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL,
297 "can't find multiplex fde for fd[%d]",
302 if (mpx_fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX) {
303 /* Logic error. Can't have more than 2 multiplexed fde's. */
304 tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL,
305 "multiplex fde for fd[%d] is already multiplexed\n",
311 * The multiplex fde must have the same fd, and also
312 * already have an epoll event attached.
314 if (!(mpx_fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT)) {
315 /* Logic error. The existing fde has no epoll event we could modify. */
316 tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL,
317 "multiplex fde for fd[%d] has no event\n",
322 /* Modify the mpx_fde to add in the new flags. */
324 event.events = epoll_map_flags(mpx_fde->flags);
325 event.events |= epoll_map_flags(add_fde->flags);
/* the registration keeps pointing at mpx_fde; add_fde is reached through the pairing below */
326 event.data.ptr = mpx_fde;
327 ret = epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_MOD, mpx_fde->fd, &event);
328 if (ret != 0 && errno == EBADF) {
329 tevent_debug(epoll_ev->ev, TEVENT_DEBUG_ERROR,
330 "EPOLL_CTL_MOD EBADF for "
331 "add_fde[%p] mpx_fde[%p] fd[%d] - disabling\n",
332 add_fde, mpx_fde, add_fde->fd);
/* the fd is invalid: detach both fde's from the event context */
333 DLIST_REMOVE(epoll_ev->ev->fd_events, mpx_fde);
334 mpx_fde->event_ctx = NULL;
335 DLIST_REMOVE(epoll_ev->ev->fd_events, add_fde);
336 add_fde->event_ctx = NULL;
338 } else if (ret != 0) {
343 * Make each fde->additional_data pointers point at each other
344 * so we can look them up from each other. They are now paired.
346 mpx_fde->additional_data = (struct tevent_fd *)add_fde;
347 add_fde->additional_data = (struct tevent_fd *)mpx_fde;
349 /* Now flag both fde's as being multiplexed. */
350 mpx_fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX;
351 add_fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX;
353 /* we need to keep the GOT_ERROR flag */
354 if (mpx_fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR) {
355 add_fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR;
/*
 * Register fde with the epoll handle (EPOLL_CTL_ADD).
 *
 * On success HAS_EVENT is set, plus REPORT_ERROR when the fde wants
 * TEVENT_FD_READ. EBADF detaches the fde from the event context; any
 * other failure goes to epoll_panic().
 *
 * NOTE(review): the declaration of "ret", the remaining tevent_debug()
 * arguments and the statements that leave the function after the error
 * branches are elided from this listing.
 */
362 add the epoll event to the given fd_event
364 static void epoll_add_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde)
366 struct epoll_event event;
/* start from a clean state; the flags are set again only on success */
369 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
370 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
373 event.events = epoll_map_flags(fde->flags);
/* epoll_event_loop() gets the fde back through data.ptr */
374 event.data.ptr = fde;
375 ret = epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_ADD, fde->fd, &event);
376 if (ret != 0 && errno == EBADF) {
377 tevent_debug(epoll_ev->ev, TEVENT_DEBUG_ERROR,
378 "EPOLL_CTL_ADD EBADF for "
379 "fde[%p] fd[%d] - disabling\n",
/* the fd is invalid: detach the fde from the event context */
381 DLIST_REMOVE(epoll_ev->ev->fd_events, fde);
382 fde->event_ctx = NULL;
384 } else if (ret != 0) {
385 epoll_panic(epoll_ev, "EPOLL_CTL_ADD failed", false);
389 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
390 /* only if we want to read we want to tell the event handler about errors */
391 if (fde->flags & TEVENT_FD_READ) {
392 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
/*
 * Remove fde from the epoll handle (EPOLL_CTL_DEL).
 *
 * HAS_EVENT and REPORT_ERROR are cleared before the call. ENOENT is only
 * traced, EBADF detaches the fde from the event context, and any other
 * failure goes to epoll_panic().
 *
 * NOTE(review): the declaration of "ret", any initialisation of "event",
 * the remaining tevent_debug() arguments and the statements that leave
 * the function after the error branches are elided from this listing.
 */
397 delete the epoll event for given fd_event
399 static void epoll_del_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde)
401 struct epoll_event event;
404 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
405 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
408 ret = epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_DEL, fde->fd, &event);
409 if (ret != 0 && errno == ENOENT) {
411 * This can happen after a epoll_check_reopen
412 * within epoll_event_fd_destructor.
414 tevent_debug(epoll_ev->ev, TEVENT_DEBUG_TRACE,
415 "EPOLL_CTL_DEL ignoring ENOENT for fd[%d]\n",
418 } else if (ret != 0 && errno == EBADF) {
419 tevent_debug(epoll_ev->ev, TEVENT_DEBUG_WARNING,
420 "EPOLL_CTL_DEL EBADF for "
421 "fde[%p] fd[%d] - disabling\n",
/* the fd is invalid: detach the fde from the event context */
423 DLIST_REMOVE(epoll_ev->ev->fd_events, fde);
424 fde->event_ctx = NULL;
426 } else if (ret != 0) {
427 epoll_panic(epoll_ev, "EPOLL_CTL_DEL failed", false);
/*
 * Update the epoll registration of fde to its current flags
 * (EPOLL_CTL_MOD).
 *
 * On success HAS_EVENT is set, plus REPORT_ERROR when the fde wants
 * TEVENT_FD_READ. EBADF detaches the fde from the event context; any
 * other failure goes to epoll_panic().
 *
 * NOTE(review): the declaration of "ret", the remaining tevent_debug()
 * arguments and the statements that leave the function after the error
 * branches are elided from this listing.
 */
433 change the epoll event to the given fd_event
435 static void epoll_mod_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde)
437 struct epoll_event event;
/* start from a clean state; the flags are set again only on success */
440 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
441 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
444 event.events = epoll_map_flags(fde->flags);
445 event.data.ptr = fde;
446 ret = epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_MOD, fde->fd, &event);
447 if (ret != 0 && errno == EBADF) {
448 tevent_debug(epoll_ev->ev, TEVENT_DEBUG_ERROR,
449 "EPOLL_CTL_MOD EBADF for "
450 "fde[%p] fd[%d] - disabling\n",
/* the fd is invalid: detach the fde from the event context */
452 DLIST_REMOVE(epoll_ev->ev->fd_events, fde);
453 fde->event_ctx = NULL;
455 } else if (ret != 0) {
456 epoll_panic(epoll_ev, "EPOLL_CTL_MOD failed", false);
460 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
461 /* only if we want to read we want to tell the event handler about errors */
462 if (fde->flags & TEVENT_FD_READ) {
463 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
/*
 * Bring the epoll registration of fde in line with its current flags.
 *
 * The fde is "interested" when it wants to read, or wants to write and
 * has not yet seen an error (GOT_ERROR). Depending on whether it already
 * has an epoll event (HAS_EVENT) this calls epoll_mod_event(),
 * epoll_del_event() or epoll_add_event().
 *
 * NOTE(review): the return statements separating the branches are elided
 * from this listing.
 */
467 static void epoll_update_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde)
469 bool got_error = (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR);
470 bool want_read = (fde->flags & TEVENT_FD_READ);
471 bool want_write= (fde->flags & TEVENT_FD_WRITE);
473 /* there's already an event */
474 if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT) {
/* still interested: only the event mask may need to change */
475 if (want_read || (want_write && !got_error)) {
476 epoll_mod_event(epoll_ev, fde);
480 * if we want to match the select behavior, we need to remove the epoll_event
481 * when the caller isn't interested in events.
483 * this is because epoll reports EPOLLERR and EPOLLHUP, even without asking for them
485 epoll_del_event(epoll_ev, fde);
489 /* there's no epoll_event attached to the fde */
490 if (want_read || (want_write && !got_error)) {
491 epoll_add_event(epoll_ev, fde);
/*
 * Wait once on the epoll handle and dispatch the reported events.
 *
 * epoll_ev - backend state.
 * tvalp    - maximum time to wait; converted to milliseconds.
 *
 * Signals are checked before the wait and again after an EINTR. Any other
 * epoll_wait() failure goes to epoll_panic() with replay set to true. A
 * timeout (ret == 0) runs the timer handling instead.
 *
 * NOTE(review): the declarations of ret, i, timeout, wait_errno and flags,
 * the definition of MAXEVENTS, the assignment of wait_errno, the guard
 * around the timeout conversion, and all return statements are elided
 * from this listing.
 */
497 event loop handling using epoll
499 static int epoll_event_loop(struct epoll_event_context *epoll_ev, struct timeval *tvalp)
503 struct epoll_event events[MAXEVENTS];
508 /* it's better to trigger timed events a bit later than too early */
/*
 * microseconds are rounded up to whole milliseconds
 * NOTE(review): tv_sec*1000 could overflow for very large delays,
 * depending on the type of "timeout" (not shown) - confirm.
 */
509 timeout = ((tvalp->tv_usec+999) / 1000) + (tvalp->tv_sec*1000);
/* handle signals that are already pending before going to sleep */
512 if (epoll_ev->ev->signal_events &&
513 tevent_common_check_signal(epoll_ev->ev)) {
517 tevent_trace_point_callback(epoll_ev->ev, TEVENT_TRACE_BEFORE_WAIT);
518 ret = epoll_wait(epoll_ev->epoll_fd, events, MAXEVENTS, timeout);
520 tevent_trace_point_callback(epoll_ev->ev, TEVENT_TRACE_AFTER_WAIT);
/* the wait was interrupted: see whether a tevent signal handler is due */
522 if (ret == -1 && wait_errno == EINTR && epoll_ev->ev->signal_events) {
523 if (tevent_common_check_signal(epoll_ev->ev)) {
528 if (ret == -1 && wait_errno != EINTR) {
529 epoll_panic(epoll_ev, "epoll_wait() failed", true);
/* timed out without any fd event */
533 if (ret == 0 && tvalp) {
534 /* we don't care about a possible delay here */
535 tevent_common_loop_timer_delay(epoll_ev->ev);
539 for (i=0;i<ret;i++) {
/* data.ptr was set to the fde when the event was registered */
540 struct tevent_fd *fde = talloc_get_type(events[i].data.ptr,
545 epoll_panic(epoll_ev, "epoll_wait() gave bad data", true);
548 if (events[i].events & (EPOLLHUP|EPOLLERR)) {
/* remember the error so epoll_update_event() stops waiting for write only */
549 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR;
551 * if we only wait for TEVENT_FD_WRITE, we should not tell the
552 * event handler about it, and remove the epoll_event,
553 * as we only report errors when waiting for read events,
554 * to match the select() behavior
556 if (!(fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR)) {
557 epoll_update_event(epoll_ev, fde);
/* errors are delivered to the handler as a read event */
560 flags |= TEVENT_FD_READ;
562 if (events[i].events & EPOLLIN) flags |= TEVENT_FD_READ;
563 if (events[i].events & EPOLLOUT) flags |= TEVENT_FD_WRITE;
565 fde->handler(epoll_ev->ev, fde, flags, fde->private_data);
/*
 * context_init hook of the backend: allocate the backend state as a
 * talloc child of ev, create the epoll handle and attach the state to
 * ev->additional_data.
 *
 * Returns -1 when the allocation fails.
 *
 * NOTE(review): the declaration of "ret", its check after
 * epoll_init_ctx(), the assignment of epoll_ev->ev and the remaining
 * return statements are elided from this listing.
 */
574 create a epoll_event_context structure.
576 static int epoll_event_context_init(struct tevent_context *ev)
579 struct epoll_event_context *epoll_ev;
582 * We might be called during tevent_re_initialise()
583 * which means we need to free our old additional_data.
585 TALLOC_FREE(ev->additional_data);
587 epoll_ev = talloc_zero(ev, struct epoll_event_context);
588 if (!epoll_ev) return -1;
/* mark the handle invalid until epoll_init_ctx() has created it */
590 epoll_ev->epoll_fd = -1;
592 ret = epoll_init_ctx(epoll_ev);
/* failure path: drop the half-initialised state again */
594 talloc_free(epoll_ev);
598 ev->additional_data = epoll_ev;
/*
 * talloc destructor for fde's created by epoll_event_add_fd().
 *
 * Takes the fde off the context's list, lets epoll_update_event() adjust
 * the epoll registration and then always finishes with
 * tevent_common_fd_destructor(). If a panic is recorded along the way the
 * backend state is not touched any further.
 *
 * NOTE(review): the condition guarding the first early return and the
 * statements directly around the epoll_update_event() call (the saved
 * "flags" value is not used in the visible lines) are elided from this
 * listing.
 */
605 static int epoll_event_fd_destructor(struct tevent_fd *fde)
607 struct tevent_context *ev = fde->event_ctx;
608 struct epoll_event_context *epoll_ev = NULL;
609 bool panic_triggered = false;
610 int flags = fde->flags;
613 return tevent_common_fd_destructor(fde);
/* aborts if additional_data is not a struct epoll_event_context */
616 epoll_ev = talloc_get_type_abort(ev->additional_data,
617 struct epoll_event_context);
620 * we must remove the event from the list
621 * otherwise a panic fallback handler may
622 * reuse invalid memory
624 DLIST_REMOVE(ev->fd_events, fde);
/* let epoll_panic() report into our local flag */
626 epoll_ev->panic_state = &panic_triggered;
627 epoll_check_reopen(epoll_ev);
628 if (panic_triggered) {
/* epoll_ev has been freed by epoll_panic(); do not use it again */
629 return tevent_common_fd_destructor(fde);
633 epoll_update_event(epoll_ev, fde);
635 if (panic_triggered) {
636 return tevent_common_fd_destructor(fde);
/* panic_triggered goes out of scope on return, so unhook it */
638 epoll_ev->panic_state = NULL;
640 return tevent_common_fd_destructor(fde);
/*
 * add_fd hook of the backend: create the generic fde with
 * tevent_common_add_fd(), install the epoll-specific destructor and
 * register the fd with the epoll handle.
 *
 * Returns NULL when tevent_common_add_fd() fails.
 *
 * NOTE(review): the private_data parameter, the body of the
 * panic_triggered branch and the final return statement are elided from
 * this listing. The talloc_get_type() result is dereferenced without a
 * NULL check in the visible lines.
 */
645 return NULL on failure (memory allocation error)
647 static struct tevent_fd *epoll_event_add_fd(struct tevent_context *ev, TALLOC_CTX *mem_ctx,
648 int fd, uint16_t flags,
649 tevent_fd_handler_t handler,
651 const char *handler_name,
652 const char *location)
654 struct epoll_event_context *epoll_ev = talloc_get_type(ev->additional_data,
655 struct epoll_event_context);
656 struct tevent_fd *fde;
657 bool panic_triggered = false;
659 fde = tevent_common_add_fd(ev, mem_ctx, fd, flags,
660 handler, private_data,
661 handler_name, location);
662 if (!fde) return NULL;
/* replaces the default destructor so the epoll registration is cleaned up too */
664 talloc_set_destructor(fde, epoll_event_fd_destructor);
/* let epoll_panic() report into our local flag while reopening */
666 epoll_ev->panic_state = &panic_triggered;
667 epoll_check_reopen(epoll_ev);
668 if (panic_triggered) {
671 epoll_ev->panic_state = NULL;
673 epoll_update_event(epoll_ev, fde);
/*
 * set_fd_flags hook of the backend: change the TEVENT_FD_* flags of an
 * fde and update its epoll registration to match.
 *
 * Nothing is done when the flags are unchanged.
 *
 * NOTE(review): the assignments of "ev" and of fde->flags and the body of
 * the panic_triggered branch are elided from this listing.
 */
679 set the fd event flags
681 static void epoll_event_set_fd_flags(struct tevent_fd *fde, uint16_t flags)
683 struct tevent_context *ev;
684 struct epoll_event_context *epoll_ev;
685 bool panic_triggered = false;
/* nothing to do */
687 if (fde->flags == flags) return;
690 epoll_ev = talloc_get_type(ev->additional_data, struct epoll_event_context);
/* let epoll_panic() report into our local flag while reopening */
694 epoll_ev->panic_state = &panic_triggered;
695 epoll_check_reopen(epoll_ev);
696 if (panic_triggered) {
699 epoll_ev->panic_state = NULL;
701 epoll_update_event(epoll_ev, fde);
/*
 * loop_once hook of the backend.
 *
 * Checks pending signals, immediate events and timers in that order. Only
 * when none of them short-circuits the iteration does it reopen the epoll
 * handle if needed and wait for fd events with the delay obtained from
 * the timer handling.
 *
 * NOTE(review): the declaration of "tval", the bodies of the early-exit
 * branches and the body of the panic_triggered branch are elided from
 * this listing.
 */
705 do a single event loop using the events defined in ev
707 static int epoll_event_loop_once(struct tevent_context *ev, const char *location)
709 struct epoll_event_context *epoll_ev = talloc_get_type(ev->additional_data,
710 struct epoll_event_context);
712 bool panic_triggered = false;
714 if (ev->signal_events &&
715 tevent_common_check_signal(ev)) {
719 if (ev->immediate_events &&
720 tevent_common_loop_immediate(ev)) {
724 tval = tevent_common_loop_timer_delay(ev);
725 if (tevent_timeval_is_zero(&tval)) {
/*
 * A panic during the reopen is reported through panic_triggered.
 * panic_force_replay is set only for the duration of the reopen.
 */
729 epoll_ev->panic_state = &panic_triggered;
730 epoll_ev->panic_force_replay = true;
731 epoll_check_reopen(epoll_ev);
732 if (panic_triggered) {
736 epoll_ev->panic_force_replay = false;
737 epoll_ev->panic_state = NULL;
739 return epoll_event_loop(epoll_ev, &tval);
/*
 * Operations table of the "epoll" backend. Only context_init, add_fd,
 * set_fd_flags and loop_once are implemented in this file; the remaining
 * entries use the tevent_common_* implementations.
 */
742 static const struct tevent_ops epoll_event_ops = {
743 .context_init = epoll_event_context_init,
744 .add_fd = epoll_event_add_fd,
745 .set_fd_close_fn = tevent_common_fd_set_close_fn,
746 .get_fd_flags = tevent_common_fd_get_flags,
747 .set_fd_flags = epoll_event_set_fd_flags,
748 .add_timer = tevent_common_add_timer,
749 .schedule_immediate = tevent_common_schedule_immediate,
750 .add_signal = tevent_common_add_signal,
751 .loop_once = epoll_event_loop_once,
752 .loop_wait = tevent_common_loop_wait,
/*
 * Register this backend with tevent under the name "epoll".
 * Returns the result of tevent_register_backend().
 */
755 _PRIVATE_ bool tevent_epoll_init(void)
757 return tevent_register_backend("epoll", &epoll_event_ops);